; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX90A %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX90APLUS,GFX942 %s


; Mask is entirely poison, so the stored value is unconstrained. The expected
; output is only the entry wait and the return: no asm def and no store.
define void @v_shuffle_v2bf16_v8bf16__u_u(ptr addrspace(1) inreg %ptr) {
; GFX9-LABEL: v_shuffle_v2bf16_v8bf16__u_u:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> poison
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <0, poison>: lane 0 reads element 0 of %vec0, lane 1 is unconstrained.
; The expected output stores the first source dword (v0) unchanged.
define void @v_shuffle_v2bf16_v8bf16__0_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__0_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 0, i32 poison>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <1, poison>: lane 0 reads element 1 of %vec0, lane 1 is unconstrained.
; The expected output uses v_alignbit_b32 with a shift of 16 to bring the upper
; half of v0 down into the low half before the store.
define void @v_shuffle_v2bf16_v8bf16__1_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, s4, v0, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, s4, v0, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__1_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, s0, v0, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 1, i32 poison>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <2, poison>: lane 0 reads element 2 of %vec0, lane 1 is unconstrained.
; The expected output stores the second source dword (v1) unchanged.
define void @v_shuffle_v2bf16_v8bf16__2_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v1, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v1, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__2_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v1, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 2, i32 poison>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <3, poison>: lane 0 reads element 3 of %vec0, lane 1 is unconstrained.
; The expected output uses v_alignbit_b32 with a shift of 16 to bring the upper
; half of v1 down into the low half before the store.
define void @v_shuffle_v2bf16_v8bf16__3_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, s4, v1, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, s4, v1, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__3_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, s0, v1, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 3, i32 poison>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <4, poison>: lane 0 reads element 4 of %vec0, lane 1 is unconstrained.
; The expected output stores the third source dword (v2) unchanged.
define void @v_shuffle_v2bf16_v8bf16__4_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v2, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v2, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__4_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v2, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 4, i32 poison>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <5, poison>: lane 0 reads element 5 of %vec0, lane 1 is unconstrained.
; The expected output uses v_alignbit_b32 with a shift of 16 to bring the upper
; half of v2 down into the low half before the store.
define void @v_shuffle_v2bf16_v8bf16__5_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, s4, v2, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, s4, v2, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__5_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, s0, v2, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 5, i32 poison>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <6, poison>: lane 0 reads element 6 of %vec0, lane 1 is unconstrained.
; The expected output stores the fourth source dword (v3) unchanged.
define void @v_shuffle_v2bf16_v8bf16__6_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v3, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v3, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__6_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v3, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 6, i32 poison>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <7, poison>: lane 0 reads element 7 of %vec0, lane 1 is unconstrained.
; The expected output uses v_alignbit_b32 with a shift of 16 to bring the upper
; half of v3 down into the low half before the store.
define void @v_shuffle_v2bf16_v8bf16__7_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, s4, v3, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, s4, v3, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__7_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, s0, v3, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 7, i32 poison>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <8, poison> with a poison second operand: index 8 addresses element 0 of
; the second operand, so both lanes are unconstrained. The expected output
; contains neither the asm def nor a store.
define void @v_shuffle_v2bf16_v8bf16__8_u(ptr addrspace(1) inreg %ptr) {
; GFX9-LABEL: v_shuffle_v2bf16_v8bf16__8_u:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 8, i32 poison>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <9, poison>: lane 0 reads element 1 of %vec1 (indices 8-15 address the
; second operand), lane 1 is unconstrained. %vec0 is not referenced by the mask
; and the expected output shows a single asm def. The selected element is moved
; into the low half with v_alignbit_b32 (shift of 16).
define void @v_shuffle_v2bf16_v8bf16__9_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, s4, v0, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, s4, v0, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__9_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, s0, v0, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 poison>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <10, poison>: lane 0 reads element 2 of %vec1 (indices 8-15 address the
; second operand), lane 1 is unconstrained. %vec0 is not referenced by the mask
; and the expected output shows a single asm def, whose second dword (v1) is
; stored unchanged.
define void @v_shuffle_v2bf16_v8bf16__10_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v1, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v1, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__10_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v1, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 poison>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <11, poison>: lane 0 reads element 3 of %vec1 (indices 8-15 address the
; second operand), lane 1 is unconstrained. %vec0 is not referenced by the mask
; and the expected output shows a single asm def. The selected element is moved
; into the low half with v_alignbit_b32 (shift of 16) on v1.
define void @v_shuffle_v2bf16_v8bf16__11_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, s4, v1, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, s4, v1, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__11_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, s0, v1, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 poison>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <12, poison>: lane 0 reads element 4 of %vec1 (indices 8-15 address the
; second operand), lane 1 is unconstrained. %vec0 is not referenced by the mask
; and the expected output shows a single asm def, whose third dword (v2) is
; stored unchanged.
define void @v_shuffle_v2bf16_v8bf16__12_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v2, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v2, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__12_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v2, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 poison>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <13, poison>: lane 0 reads element 5 of %vec1 (indices 8-15 address the
; second operand), lane 1 is unconstrained. %vec0 is not referenced by the mask
; and the expected output shows a single asm def. The selected element is moved
; into the low half with v_alignbit_b32 (shift of 16) on v2.
define void @v_shuffle_v2bf16_v8bf16__13_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, s4, v2, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, s4, v2, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__13_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, s0, v2, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 poison>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <14, poison>: lane 0 reads element 6 of %vec1 (indices 8-15 address the
; second operand), lane 1 is unconstrained. %vec0 is not referenced by the mask
; and the expected output shows a single asm def, whose fourth dword (v3) is
; stored unchanged.
define void @v_shuffle_v2bf16_v8bf16__14_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v3, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v3, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__14_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v3, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 poison>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <15, poison>: lane 0 reads element 7 of %vec1 (indices 8-15 address the
; second operand), lane 1 is unconstrained. %vec0 is not referenced by the mask
; and the expected output shows a single asm def. The selected element is moved
; into the low half with v_alignbit_b32 (shift of 16) on v3.
define void @v_shuffle_v2bf16_v8bf16__15_u(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, s4, v3, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, s4, v3, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__15_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, s0, v3, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 poison>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <15, 0>: lane 0 reads element 7 of %vec1 and lane 1 reads element 0 of
; %vec0, so both asm defs appear in the expected output. The two source dwords
; are merged with a single v_alignbit_b32 (shift of 16): the upper half of the
; last %vec1 dword becomes the low half, the lower half of v0 the high half.
define void @v_shuffle_v2bf16_v8bf16__15_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v5, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:4]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v0, v0, v4, 16
; GFX900-NEXT:    global_store_dword v5, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v0, v0, v5, 16
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__15_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_alignbit_b32 v0, v0, v5, 16
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 0>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <15, 1>: lane 0 reads element 7 of %vec1 and lane 1 reads element 1 of
; %vec0, so both asm defs appear in the expected output. Both selected elements
; sit in the upper half of their source dword; the expected output combines
; them with v_perm_b32 using the byte selector 0x7060302.
define void @v_shuffle_v2bf16_v8bf16__15_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v5, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:4]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v0, v4, s4
; GFX900-NEXT:    global_store_dword v5, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v0, v5, s4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__15_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v0, v5, s2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 1>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <15, 2>: lane 0 reads element 7 of %vec1 and lane 1 reads element 2 of
; %vec0, so both asm defs appear in the expected output. The two source dwords
; are merged with a single v_alignbit_b32 (shift of 16): the upper half of the
; last %vec1 dword becomes the low half, the lower half of v1 the high half.
define void @v_shuffle_v2bf16_v8bf16__15_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v6, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:5]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v0, v1, v5, 16
; GFX900-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v0, v1, v5, 16
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__15_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_alignbit_b32 v0, v1, v5, 16
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 2>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <15, 3>: lane 0 reads element 7 of %vec1 and lane 1 reads element 3 of
; %vec0, so both asm defs appear in the expected output. Both selected elements
; sit in the upper half of their source dword; the expected output combines
; them with v_perm_b32 using the byte selector 0x7060302.
define void @v_shuffle_v2bf16_v8bf16__15_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v6, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:5]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v1, v5, s4
; GFX900-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v1, v5, s4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__15_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v1, v5, s2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 3>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-source shuffle with mask <15, 4>: result lane 0 is %vec1[7] and result
; lane 1 is %vec0[4]. The expected code merges the halves with one
; v_alignbit_b32 (shift 16). Note the register assignment differs per target:
; the gfx900 checks place the second asm def in v[3:6], while the gfx90a and
; gfx942 checks place it in v[4:7].
define void @v_shuffle_v2bf16_v8bf16__15_4(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v7, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[3:6]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v0, v2, v6, 16
; GFX900-NEXT:    global_store_dword v7, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v0, v2, v7, 16
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__15_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_alignbit_b32 v0, v2, v7, 16
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Both inputs come from opaque inline asm so they cannot be constant-folded.
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  ; Mask indices 0-7 select from %vec0, 8-15 from %vec1.
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 4>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-source shuffle with mask <15, 5>: result lane 0 is %vec1[7] and result
; lane 1 is %vec0[5]. The expected code merges the halves with one v_perm_b32
; (selector 0x7060302). The gfx900 checks place the second asm def in v[3:6],
; while the gfx90a and gfx942 checks place it in v[4:7].
define void @v_shuffle_v2bf16_v8bf16__15_5(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v7, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[3:6]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v2, v6, s4
; GFX900-NEXT:    global_store_dword v7, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v2, v7, s4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__15_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v2, v7, s2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Both inputs come from opaque inline asm so they cannot be constant-folded.
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  ; Mask indices 0-7 select from %vec0, 8-15 from %vec1.
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 5>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-source shuffle with mask <15, 6>: result lane 0 is %vec1[7] and result
; lane 1 is %vec0[6]. All three targets are expected to place the second asm
; def in v[4:7] and merge the halves with one v_alignbit_b32 (shift 16).
define void @v_shuffle_v2bf16_v8bf16__15_6(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v8, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v0, v3, v7, 16
; GFX900-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v0, v3, v7, 16
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__15_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_alignbit_b32 v0, v3, v7, 16
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Both inputs come from opaque inline asm so they cannot be constant-folded.
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  ; Mask indices 0-7 select from %vec0, 8-15 from %vec1.
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 6>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-source shuffle with mask <15, 7>: result lane 0 is %vec1[7] and result
; lane 1 is %vec0[7], i.e. the last element of each input. All three targets
; are expected to merge the halves with one v_perm_b32 (selector 0x7060302).
define void @v_shuffle_v2bf16_v8bf16__15_7(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v8, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v3, v7, s4
; GFX900-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v3, v7, s4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__15_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v3, v7, s2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Both inputs come from opaque inline asm so they cannot be constant-folded.
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  ; Mask indices 0-7 select from %vec0, 8-15 from %vec1.
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 7>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle with mask <15, 8>: result lane 0 is %vec1[7] and result lane 1 is
; %vec1[0], so only the second operand is read. The expected output contains a
; single inline-asm def (presumably the unused %vec0 asm is dropped) followed
; by one v_alignbit_b32 (shift 16).
define void @v_shuffle_v2bf16_v8bf16__15_8(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v0, v3, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v0, v3, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__15_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v0, v3, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  ; Mask indices 8-15 select from %vec1; no index refers to %vec0 here.
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 8>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle with mask <15, 9>: result lane 0 is %vec1[7] and result lane 1 is
; %vec1[1], so only the second operand is read. The expected output contains a
; single inline-asm def (presumably the unused %vec0 asm is dropped) followed
; by one v_perm_b32 (selector 0x7060302).
define void @v_shuffle_v2bf16_v8bf16__15_9(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v0, v3, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v0, v3, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__15_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v0, v3, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  ; Mask indices 8-15 select from %vec1; no index refers to %vec0 here.
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 9>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle with mask <15, 10>: result lane 0 is %vec1[7] and result lane 1 is
; %vec1[2], so only the second operand is read. The expected output contains a
; single inline-asm def (presumably the unused %vec0 asm is dropped) followed
; by one v_alignbit_b32 (shift 16) on v1 and v3.
define void @v_shuffle_v2bf16_v8bf16__15_10(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v1, v3, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v1, v3, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__15_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v1, v3, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  ; Mask indices 8-15 select from %vec1; no index refers to %vec0 here.
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 10>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle with mask <15, 11>: result lane 0 is %vec1[7] and result lane 1 is
; %vec1[3], so only the second operand is read. The expected output contains a
; single inline-asm def (presumably the unused %vec0 asm is dropped) followed
; by one v_perm_b32 (selector 0x7060302) on v1 and v3.
define void @v_shuffle_v2bf16_v8bf16__15_11(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v1, v3, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v1, v3, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__15_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v1, v3, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  ; Mask indices 8-15 select from %vec1; no index refers to %vec0 here.
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 11>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle with mask <15, 12>: result lane 0 is %vec1[7] and result lane 1 is
; %vec1[4], so only the second operand is read. The expected output contains a
; single inline-asm def (presumably the unused %vec0 asm is dropped) followed
; by one v_alignbit_b32 (shift 16) on v2 and v3.
define void @v_shuffle_v2bf16_v8bf16__15_12(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v2, v3, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v2, v3, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__15_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v2, v3, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  ; Mask indices 8-15 select from %vec1; no index refers to %vec0 here.
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 12>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle with mask <15, 13>: result lane 0 is %vec1[7] and result lane 1 is
; %vec1[5], so only the second operand is read. The expected output contains a
; single inline-asm def (presumably the unused %vec0 asm is dropped) followed
; by one v_perm_b32 (selector 0x7060302) on v2 and v3.
define void @v_shuffle_v2bf16_v8bf16__15_13(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v2, v3, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v2, v3, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__15_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v2, v3, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  ; Mask indices 8-15 select from %vec1; no index refers to %vec0 here.
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 13>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle with mask <15, 14>: result lane 0 is %vec1[7] and result lane 1 is
; %vec1[6], i.e. the last two elements of the second operand swapped. The
; expected output contains a single inline-asm def (presumably the unused
; %vec0 asm is dropped) followed by v_alignbit_b32 of v3 with itself, shift 16.
define void @v_shuffle_v2bf16_v8bf16__15_14(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v3, v3, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v3, v3, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__15_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v3, v3, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  ; Mask indices 8-15 select from %vec1; no index refers to %vec0 here.
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 14>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle with mask <15, 15>: both result lanes are %vec1[7] (a splat of the
; last element of the second operand). The expected output contains a single
; inline-asm def (presumably the unused %vec0 asm is dropped) followed by
; v_perm_b32 of v3 with itself (selector 0x7060302).
define void @v_shuffle_v2bf16_v8bf16__15_15(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__15_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v3, v3, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__15_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__15_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v3, v3, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  ; Mask indices 8-15 select from %vec1; no index refers to %vec0 here.
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 15>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-source shuffle with mask <poison, 0>: result lane 0 is unconstrained
; and result lane 1 is %vec0[0]. All three targets are expected to produce the
; stored dword with a single v_lshlrev_b32 of v0 by 16.
define void @v_shuffle_v2bf16_v8bf16__u_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__u_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; The input comes from opaque inline asm so it cannot be constant-folded.
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  ; The second shuffle operand is poison; only %vec0[0] is actually selected.
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 poison, i32 0>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-source shuffle with an all-zero mask: both result lanes are %vec0[0]
; (a splat of the first element). All three targets are expected to build the
; stored dword with v_perm_b32 of v0 with itself (selector 0x5040100).
define void @v_shuffle_v2bf16_v8bf16__0_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__0_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v0, v0, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; The input comes from opaque inline asm so it cannot be constant-folded.
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  ; zeroinitializer is the mask <0, 0>; the second shuffle operand is poison.
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> zeroinitializer
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-source shuffle with mask <1, 0>: result lane 0 is %vec0[1] and result
; lane 1 is %vec0[0], i.e. the first two elements swapped. All three targets
; are expected to use v_alignbit_b32 of v0 with itself, shift 16.
define void @v_shuffle_v2bf16_v8bf16__1_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v0, v0, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v0, v0, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__1_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v0, v0, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; The input comes from opaque inline asm so it cannot be constant-folded.
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  ; The second shuffle operand is poison; both mask indices select from %vec0.
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 1, i32 0>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-source shuffle with mask <2, 0>: result lane 0 is %vec0[2] and result
; lane 1 is %vec0[0]. All three targets are expected to merge v0 and v1 with
; one v_perm_b32 (selector 0x5040100).
define void @v_shuffle_v2bf16_v8bf16__2_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v0, v1, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v0, v1, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__2_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v0, v1, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; The input comes from opaque inline asm so it cannot be constant-folded.
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  ; The second shuffle operand is poison; both mask indices select from %vec0.
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 2, i32 0>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-source shuffle with mask <3, 0>: result lane 0 is %vec0[3] and result
; lane 1 is %vec0[0]. All three targets are expected to merge v0 and v1 with
; one v_alignbit_b32 (shift 16).
define void @v_shuffle_v2bf16_v8bf16__3_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v0, v1, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v0, v1, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__3_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v0, v1, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; The input comes from opaque inline asm so it cannot be constant-folded.
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  ; The second shuffle operand is poison; both mask indices select from %vec0.
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 3, i32 0>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-source shuffle with mask <4, 0>: result lane 0 is %vec0[4] and result
; lane 1 is %vec0[0]. All three targets are expected to merge v0 and v2 with
; one v_perm_b32 (selector 0x5040100).
define void @v_shuffle_v2bf16_v8bf16__4_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v0, v2, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v0, v2, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__4_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v0, v2, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; The input comes from opaque inline asm so it cannot be constant-folded.
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  ; The second shuffle operand is poison; both mask indices select from %vec0.
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 4, i32 0>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <5, 0>: lane 0 takes element 5 of %vec0 and lane 1 takes
; element 0. The second shufflevector operand is poison, so there is a single
; inline-asm def. The expected output on all three targets combines v0 and v2
; with one v_alignbit_b32 (shift 16) and stores the resulting dword through
; %ptr.
define void @v_shuffle_v2bf16_v8bf16__5_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v0, v2, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v0, v2, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__5_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v0, v2, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 5, i32 0>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <6, 0>: lane 0 takes element 6 of %vec0 and lane 1 takes
; element 0. The second shufflevector operand is poison, so there is a single
; inline-asm def. The expected output on all three targets combines v3 and v0
; with one v_perm_b32 (selector 0x5040100) and stores the resulting dword
; through %ptr.
define void @v_shuffle_v2bf16_v8bf16__6_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v0, v3, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v0, v3, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__6_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v0, v3, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 6, i32 0>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <7, 0>: lane 0 takes element 7 of %vec0 and lane 1 takes
; element 0. The second shufflevector operand is poison, so there is a single
; inline-asm def. The expected output on all three targets combines v0 and v3
; with one v_alignbit_b32 (shift 16) and stores the resulting dword through
; %ptr.
define void @v_shuffle_v2bf16_v8bf16__7_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v0, v3, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v0, v3, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__7_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v0, v3, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 7, i32 0>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <8, 0>: lane 1 takes element 0 of %vec0. Mask index 8 is past
; the 8 elements of %vec0, so lane 0 names element 0 of the second operand,
; which is poison in this function. The expected output on all three targets
; therefore only shifts v0 left by 16 (v_lshlrev_b32) to move element 0 into
; the upper half, then stores the dword through %ptr.
; NOTE(review): the 9..14 variants of this test define a real %vec1 for the
; second operand; here it is poison, so lane 0 carries no defined value.
; Confirm this is what the test generator intends.
define void @v_shuffle_v2bf16_v8bf16__8_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__8_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 8, i32 0>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <9, 0> over two defined operands: mask index 9 is past the 8
; elements of %vec0, so lane 0 names element 1 of %vec1; lane 1 takes element 0
; of %vec0. The expected output has two inline-asm defs and one v_alignbit_b32
; (shift 16). The second def is expected in v[1:4] on gfx900 and in v[2:5] on
; gfx90a/gfx942, so the second source register differs per target. The gfx942
; output additionally has an s_nop 0 after the second def.
define void @v_shuffle_v2bf16_v8bf16__9_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v5, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:4]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v0, v0, v1, 16
; GFX900-NEXT:    global_store_dword v5, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v0, v0, v2, 16
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__9_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_alignbit_b32 v0, v0, v2, 16
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 0>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <10, 0> over two defined operands: mask index 10 is past the 8
; elements of %vec0, so lane 0 names element 2 of %vec1; lane 1 takes element 0
; of %vec0. The expected output has two inline-asm defs and one v_perm_b32
; (selector 0x5040100). The second def is expected in v[1:4] on gfx900 and in
; v[2:5] on gfx90a/gfx942, so the second source register differs per target.
; The gfx942 output additionally has an s_nop 0 after the second def.
define void @v_shuffle_v2bf16_v8bf16__10_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v5, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:4]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v0, v2, s4
; GFX900-NEXT:    global_store_dword v5, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v0, v3, s4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__10_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v0, v3, s2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 0>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <11, 0> over two defined operands: mask index 11 is past the 8
; elements of %vec0, so lane 0 names element 3 of %vec1; lane 1 takes element 0
; of %vec0. The expected output has two inline-asm defs and one v_alignbit_b32
; (shift 16). The second def is expected in v[1:4] on gfx900 and in v[2:5] on
; gfx90a/gfx942, so the second source register differs per target. The gfx942
; output additionally has an s_nop 0 after the second def.
define void @v_shuffle_v2bf16_v8bf16__11_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v5, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:4]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v0, v0, v2, 16
; GFX900-NEXT:    global_store_dword v5, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v0, v0, v3, 16
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__11_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_alignbit_b32 v0, v0, v3, 16
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 0>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <12, 0> over two defined operands: mask index 12 is past the 8
; elements of %vec0, so lane 0 names element 4 of %vec1; lane 1 takes element 0
; of %vec0. The expected output has two inline-asm defs and one v_perm_b32
; (selector 0x5040100). The second def is expected in v[1:4] on gfx900 and in
; v[2:5] on gfx90a/gfx942, so the second source register differs per target.
; The gfx942 output additionally has an s_nop 0 after the second def.
define void @v_shuffle_v2bf16_v8bf16__12_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v5, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:4]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v0, v3, s4
; GFX900-NEXT:    global_store_dword v5, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v0, v4, s4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__12_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v0, v4, s2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 0>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <13, 0> over two defined operands: mask index 13 is past the 8
; elements of %vec0, so lane 0 names element 5 of %vec1; lane 1 takes element 0
; of %vec0. The expected output has two inline-asm defs and one v_alignbit_b32
; (shift 16). The second def is expected in v[1:4] on gfx900 and in v[2:5] on
; gfx90a/gfx942, so the second source register differs per target. The gfx942
; output additionally has an s_nop 0 after the second def.
define void @v_shuffle_v2bf16_v8bf16__13_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v5, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:4]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v0, v0, v3, 16
; GFX900-NEXT:    global_store_dword v5, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v0, v0, v4, 16
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__13_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_alignbit_b32 v0, v0, v4, 16
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 0>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <14, 0> over two defined operands: mask index 14 is past the 8
; elements of %vec0, so lane 0 names element 6 of %vec1; lane 1 takes element 0
; of %vec0. The expected output has two inline-asm defs and one v_perm_b32
; (selector 0x5040100). The second def is expected in v[1:4] on gfx900 and in
; v[2:5] on gfx90a/gfx942, so the second source register differs per target.
; The gfx942 output additionally has an s_nop 0 after the second def.
define void @v_shuffle_v2bf16_v8bf16__14_0(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v5, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:4]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v0, v4, s4
; GFX900-NEXT:    global_store_dword v5, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v0, v5, s4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__14_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v0, v5, s2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 0>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <poison, 1>: lane 0 is poison and lane 1 takes element 1 of
; %vec0. The second shufflevector operand is poison, so there is a single
; inline-asm def. The expected output on all three targets has no combining
; instruction: v0 from the asm def is stored as a dword through %ptr directly.
define void @v_shuffle_v2bf16_v8bf16__u_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__u_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 poison, i32 1>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <0, 1>: the result is the first two elements of %vec0 in their
; original order. The second shufflevector operand is poison, so there is a
; single inline-asm def. The expected output on all three targets has no
; combining instruction: v0 from the asm def is stored as a dword through %ptr
; directly.
define void @v_shuffle_v2bf16_v8bf16__0_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__0_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 0, i32 1>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <1, 1>: both result lanes take element 1 of %vec0. The second
; shufflevector operand is poison, so there is a single inline-asm def. The
; expected output on all three targets uses one v_perm_b32 (selector
; 0x7060302) with v0 as both vector sources, then stores the resulting dword
; through %ptr.
define void @v_shuffle_v2bf16_v8bf16__1_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__1_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v0, v0, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 1, i32 1>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <2, 1>: lane 0 takes element 2 of %vec0 and lane 1 takes
; element 1. The second shufflevector operand is poison, so there is a single
; inline-asm def. The expected output on all three targets merges v1 and v0
; with one v_bfi_b32 under the mask 0xffff and stores the resulting dword
; through %ptr.
define void @v_shuffle_v2bf16_v8bf16__2_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_bfi_b32 v0, s4, v1, v0
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v1, v0
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__2_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v1, v0
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 2, i32 1>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <3, 1>: lane 0 takes element 3 of %vec0 and lane 1 takes
; element 1. The second shufflevector operand is poison, so there is a single
; inline-asm def. The expected output on all three targets combines v0 and v1
; with one v_perm_b32 (selector 0x7060302) and stores the resulting dword
; through %ptr.
define void @v_shuffle_v2bf16_v8bf16__3_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v0, v1, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v0, v1, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__3_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v0, v1, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 3, i32 1>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <4, 1>: lane 0 takes element 4 of %vec0 and lane 1 takes
; element 1. The second shufflevector operand is poison, so there is a single
; inline-asm def. The expected output on all three targets merges v2 and v0
; with one v_bfi_b32 under the mask 0xffff and stores the resulting dword
; through %ptr.
define void @v_shuffle_v2bf16_v8bf16__4_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_bfi_b32 v0, s4, v2, v0
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v2, v0
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__4_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v2, v0
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 4, i32 1>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <5, 1>: lane 0 takes element 5 of %vec0 and lane 1 takes
; element 1. The second shufflevector operand is poison, so there is a single
; inline-asm def. The expected output on all three targets combines v0 and v2
; with one v_perm_b32 (selector 0x7060302) and stores the resulting dword
; through %ptr.
define void @v_shuffle_v2bf16_v8bf16__5_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v0, v2, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v0, v2, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__5_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v0, v2, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 5, i32 1>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <6, 1>: lane 0 takes element 6 of %vec0 and lane 1 takes
; element 1. The second shufflevector operand is poison, so there is a single
; inline-asm def. The expected output on all three targets merges v3 and v0
; with one v_bfi_b32 under the mask 0xffff and stores the resulting dword
; through %ptr.
define void @v_shuffle_v2bf16_v8bf16__6_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_bfi_b32 v0, s4, v3, v0
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v3, v0
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__6_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v3, v0
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 6, i32 1>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <7, 1>: lane 0 takes element 7 of %vec0 and lane 1 takes
; element 1. The second shufflevector operand is poison, so there is a single
; inline-asm def. The expected output on all three targets combines v0 and v3
; with one v_perm_b32 (selector 0x7060302) and stores the resulting dword
; through %ptr.
define void @v_shuffle_v2bf16_v8bf16__7_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v0, v3, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v0, v3, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__7_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v0, v3, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 7, i32 1>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <8, 1>: index 8 addresses element 0 of the second shuffle
; operand, which is poison here, so only lane 1 (element 1 of %vec0) carries a
; defined value. The checks expect v0 to be stored directly, with no
; permute/bitfield instruction in between.
define void @v_shuffle_v2bf16_v8bf16__8_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__8_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 8, i32 1>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <9, 1>: result lane 0 is element 1 of %vec1 (index 9 - 8) and
; lane 1 is element 1 of %vec0. The checks expect a v_perm_b32 with selector
; 0x7060302. The second inline-asm def is expected in v[1:4] on gfx900 and in
; v[2:5] on gfx90a/gfx942; gfx942 additionally expects an s_nop 0 before the
; v_perm_b32.
define void @v_shuffle_v2bf16_v8bf16__9_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v5, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:4]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v0, v1, s4
; GFX900-NEXT:    global_store_dword v5, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v0, v2, s4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__9_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v0, v2, s2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 1>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <10, 1>: result lane 0 is element 2 of %vec1 (index 10 - 8) and
; lane 1 is element 1 of %vec0. The checks expect a v_bfi_b32 with mask 0xffff.
; The second inline-asm def is expected in v[1:4] on gfx900 and in v[2:5] on
; gfx90a/gfx942, so the source register is v2 vs. v3; gfx942 additionally
; expects an s_nop 0 before the v_bfi_b32.
define void @v_shuffle_v2bf16_v8bf16__10_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v5, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:4]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_bfi_b32 v0, s4, v2, v0
; GFX900-NEXT:    global_store_dword v5, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v3, v0
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__10_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v3, v0
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 1>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <11, 1>: result lane 0 is element 3 of %vec1 (index 11 - 8) and
; lane 1 is element 1 of %vec0. The checks expect a v_perm_b32 with selector
; 0x7060302. The second inline-asm def is expected in v[1:4] on gfx900 and in
; v[2:5] on gfx90a/gfx942, so the source register is v2 vs. v3; gfx942
; additionally expects an s_nop 0 before the v_perm_b32.
define void @v_shuffle_v2bf16_v8bf16__11_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v5, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:4]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v0, v2, s4
; GFX900-NEXT:    global_store_dword v5, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v0, v3, s4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__11_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v0, v3, s2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 1>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <12, 1>: result lane 0 is element 4 of %vec1 (index 12 - 8) and
; lane 1 is element 1 of %vec0. The checks expect a v_bfi_b32 with mask 0xffff.
; The second inline-asm def is expected in v[1:4] on gfx900 and in v[2:5] on
; gfx90a/gfx942, so the source register is v3 vs. v4; gfx942 additionally
; expects an s_nop 0 before the v_bfi_b32.
define void @v_shuffle_v2bf16_v8bf16__12_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v5, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:4]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_bfi_b32 v0, s4, v3, v0
; GFX900-NEXT:    global_store_dword v5, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v4, v0
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__12_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v4, v0
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 1>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <13, 1>: result lane 0 is element 5 of %vec1 (index 13 - 8) and
; lane 1 is element 1 of %vec0. The checks expect a v_perm_b32 with selector
; 0x7060302. The second inline-asm def is expected in v[1:4] on gfx900 and in
; v[2:5] on gfx90a/gfx942, so the source register is v3 vs. v4; gfx942
; additionally expects an s_nop 0 before the v_perm_b32.
define void @v_shuffle_v2bf16_v8bf16__13_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v5, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:4]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v0, v3, s4
; GFX900-NEXT:    global_store_dword v5, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v0, v4, s4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__13_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v0, v4, s2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 1>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <14, 1>: result lane 0 is element 6 of %vec1 (index 14 - 8) and
; lane 1 is element 1 of %vec0. The checks expect a v_bfi_b32 with mask 0xffff.
; The second inline-asm def is expected in v[1:4] on gfx900 and in v[2:5] on
; gfx90a/gfx942, so the source register is v4 vs. v5; gfx942 additionally
; expects an s_nop 0 before the v_bfi_b32.
define void @v_shuffle_v2bf16_v8bf16__14_1(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v5, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:4]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_bfi_b32 v0, s4, v4, v0
; GFX900-NEXT:    global_store_dword v5, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v5, v0
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__14_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v5, v0
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 1>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <poison, 2>: result lane 0 is left undefined by the mask and
; lane 1 is element 2 of %vec0. The checks expect a v_lshlrev_b32 of v1 by 16
; on all three targets.
define void @v_shuffle_v2bf16_v8bf16__u_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__u_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 poison, i32 2>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <0, 2>: result lane 0 is element 0 of %vec0 and lane 1 is
; element 2 of %vec0; the second shuffle operand is poison and not indexed.
; The checks expect a single v_perm_b32 (selector 0x5040100) over v1 and v0.
define void @v_shuffle_v2bf16_v8bf16__0_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v1, v0, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v1, v0, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__0_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v1, v0, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 0, i32 2>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <1, 2>: result lane 0 is element 1 of %vec0 and lane 1 is
; element 2 of %vec0; the second shuffle operand is poison and not indexed.
; The checks expect a single v_alignbit_b32 of v1 and v0 with shift 16.
define void @v_shuffle_v2bf16_v8bf16__1_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v1, v0, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v1, v0, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__1_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v1, v0, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 1, i32 2>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <2, 2>: both result lanes are element 2 of %vec0; the second
; shuffle operand is poison and not indexed. The checks expect a single
; v_perm_b32 (selector 0x5040100) with v1 as both data operands.
define void @v_shuffle_v2bf16_v8bf16__2_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v1, v1, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v1, v1, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__2_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v1, v1, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 2, i32 2>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <3, 2>: result lane 0 is element 3 of %vec0 and lane 1 is
; element 2 of %vec0; the second shuffle operand is poison and not indexed.
; The checks expect a single v_alignbit_b32 with v1 as both data operands and
; shift 16.
define void @v_shuffle_v2bf16_v8bf16__3_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v1, v1, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v1, v1, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__3_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v1, v1, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 3, i32 2>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <4, 2>: result lane 0 is element 4 of %vec0 and lane 1 is
; element 2 of %vec0; the second shuffle operand is poison and not indexed.
; The checks expect a single v_perm_b32 (selector 0x5040100) over v1 and v2.
define void @v_shuffle_v2bf16_v8bf16__4_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v1, v2, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v1, v2, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__4_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v1, v2, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 4, i32 2>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <5, 2>: result lane 0 is element 5 of %vec0 and lane 1 is
; element 2 of %vec0; the second shuffle operand is poison and not indexed.
; The checks expect a single v_alignbit_b32 of v1 and v2 with shift 16.
define void @v_shuffle_v2bf16_v8bf16__5_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v1, v2, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v1, v2, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__5_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v1, v2, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 5, i32 2>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <6, 2>: result lane 0 is element 6 of %vec0 and lane 1 is
; element 2 of %vec0; the second shuffle operand is poison and not indexed.
; The checks expect a single v_perm_b32 (selector 0x5040100) over v1 and v3.
define void @v_shuffle_v2bf16_v8bf16__6_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v1, v3, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v1, v3, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__6_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v1, v3, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 6, i32 2>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <7, 2>: result lane 0 is element 7 of %vec0 and lane 1 is
; element 2 of %vec0; the second shuffle operand is poison and not indexed.
; The checks expect a single v_alignbit_b32 of v1 and v3 with shift 16.
define void @v_shuffle_v2bf16_v8bf16__7_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v1, v3, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v1, v3, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__7_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v1, v3, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 7, i32 2>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <8, 2>: index 8 addresses element 0 of the second shuffle
; operand, which is poison here, so only lane 1 (element 2 of %vec0) carries a
; defined value. The checks expect a v_lshlrev_b32 of v1 by 16 on all three
; targets.
define void @v_shuffle_v2bf16_v8bf16__8_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__8_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 8, i32 2>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <9, 2>: result lane 0 is element 1 of %vec1 (index 9 - 8) and
; lane 1 is element 2 of %vec0. The checks expect a v_alignbit_b32 of v1 and v2
; with shift 16 on all three targets, with the second inline-asm def in v[2:5];
; gfx942 additionally expects an s_nop 0 before the v_alignbit_b32.
define void @v_shuffle_v2bf16_v8bf16__9_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v6, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:5]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v0, v1, v2, 16
; GFX900-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v0, v1, v2, 16
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__9_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_alignbit_b32 v0, v1, v2, 16
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 2>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle test, mask <10, 2>: builds a <2 x bfloat> from two <8 x bfloat>
; inputs and stores it to %ptr. Mask indices 8-15 address the second operand,
; so lane 0 is %vec1[2] and lane 1 is %vec0[2].
; Both inputs come from inline asm that emits only a "; def" comment; in the
; expected output they occupy v[0:3] and v[2:5].
; Expected codegen on every target is one v_perm_b32 of v1 and v3 with
; selector 0x5040100 held in an SGPR (s4 on GFX900/GFX90A, s2 on GFX942); the
; GFX942 checks additionally expect an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__10_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v6, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:5]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v1, v3, s4
; GFX900-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v1, v3, s4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__10_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v1, v3, s2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 2>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle test, mask <11, 2>: builds a <2 x bfloat> from two <8 x bfloat>
; inputs and stores it to %ptr. Mask indices 8-15 address the second operand,
; so lane 0 is %vec1[3] and lane 1 is %vec0[2].
; Both inputs come from inline asm that emits only a "; def" comment; in the
; expected output they occupy v[0:3] and v[2:5].
; Expected codegen on every target is one v_alignbit_b32 of v1 and v3 by 16
; bits; the GFX942 checks additionally expect an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__11_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v6, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:5]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v0, v1, v3, 16
; GFX900-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v0, v1, v3, 16
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__11_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_alignbit_b32 v0, v1, v3, 16
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 2>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle test, mask <12, 2>: builds a <2 x bfloat> from two <8 x bfloat>
; inputs and stores it to %ptr. Mask indices 8-15 address the second operand,
; so lane 0 is %vec1[4] and lane 1 is %vec0[2].
; Both inputs come from inline asm that emits only a "; def" comment; in the
; expected output they occupy v[0:3] and v[2:5].
; Expected codegen on every target is one v_perm_b32 of v1 and v4 with
; selector 0x5040100 held in an SGPR (s4 on GFX900/GFX90A, s2 on GFX942); the
; GFX942 checks additionally expect an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__12_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v6, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:5]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v1, v4, s4
; GFX900-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v1, v4, s4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__12_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v1, v4, s2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 2>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle test, mask <13, 2>: builds a <2 x bfloat> from two <8 x bfloat>
; inputs and stores it to %ptr. Mask indices 8-15 address the second operand,
; so lane 0 is %vec1[5] and lane 1 is %vec0[2].
; Both inputs come from inline asm that emits only a "; def" comment; in the
; expected output they occupy v[0:3] and v[2:5].
; Expected codegen on every target is one v_alignbit_b32 of v1 and v4 by 16
; bits; the GFX942 checks additionally expect an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__13_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v6, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:5]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v0, v1, v4, 16
; GFX900-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v0, v1, v4, 16
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__13_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_alignbit_b32 v0, v1, v4, 16
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 2>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle test, mask <14, 2>: builds a <2 x bfloat> from two <8 x bfloat>
; inputs and stores it to %ptr. Mask indices 8-15 address the second operand,
; so lane 0 is %vec1[6] and lane 1 is %vec0[2].
; Both inputs come from inline asm that emits only a "; def" comment; in the
; expected output they occupy v[0:3] and v[2:5].
; Expected codegen on every target is one v_perm_b32 of v1 and v5 with
; selector 0x5040100 held in an SGPR (s4 on GFX900/GFX90A, s2 on GFX942); the
; GFX942 checks additionally expect an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__14_2(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v6, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:5]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v1, v5, s4
; GFX900-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v1, v5, s4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__14_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v1, v5, s2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 2>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle test, mask <poison, 3>: builds a <2 x bfloat> whose lane 1 is
; %vec0[3] and whose lane 0 is unspecified, then stores it to %ptr. The second
; shuffle operand is poison.
; The input comes from inline asm that emits only a "; def" comment; in the
; expected output it occupies v[0:3].
; Expected codegen on every target needs no shuffle instruction: v1 is stored
; as-is.
define void @v_shuffle_v2bf16_v8bf16__u_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v1, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v1, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__u_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v1, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 poison, i32 3>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle test, mask <0, 3>: builds a <2 x bfloat> from %vec0[0] (lane 0) and
; %vec0[3] (lane 1) and stores it to %ptr. The second shuffle operand is
; poison.
; The input comes from inline asm that emits only a "; def" comment; in the
; expected output it occupies v[0:3].
; Expected codegen on every target is one v_bfi_b32 merging v0 and v1 under a
; 0xffff mask held in an SGPR (s4 on GFX900/GFX90A, s2 on GFX942).
define void @v_shuffle_v2bf16_v8bf16__0_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_bfi_b32 v0, s4, v0, v1
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v0, v1
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__0_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v0, v1
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 0, i32 3>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle test, mask <1, 3>: builds a <2 x bfloat> from %vec0[1] (lane 0) and
; %vec0[3] (lane 1) and stores it to %ptr. The second shuffle operand is
; poison.
; The input comes from inline asm that emits only a "; def" comment; in the
; expected output it occupies v[0:3].
; Expected codegen on every target is one v_perm_b32 of v1 and v0 with
; selector 0x7060302 held in an SGPR (s4 on GFX900/GFX90A, s2 on GFX942).
define void @v_shuffle_v2bf16_v8bf16__1_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v1, v0, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v1, v0, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__1_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v1, v0, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 1, i32 3>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle test, mask <2, 3>: extracts the adjacent pair %vec0[2], %vec0[3] as
; a <2 x bfloat> and stores it to %ptr. The second shuffle operand is poison.
; The input comes from inline asm that emits only a "; def" comment; in the
; expected output it occupies v[0:3].
; Expected codegen on every target needs no shuffle instruction: v1 is stored
; as-is.
define void @v_shuffle_v2bf16_v8bf16__2_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v1, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v1, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__2_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v1, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 2, i32 3>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle test, mask <3, 3>: broadcasts %vec0[3] into both lanes of a
; <2 x bfloat> and stores it to %ptr. The second shuffle operand is poison.
; The input comes from inline asm that emits only a "; def" comment; in the
; expected output it occupies v[0:3].
; Expected codegen on every target is one v_perm_b32 taking v1 for both source
; operands, with selector 0x7060302 held in an SGPR (s4 on GFX900/GFX90A, s2
; on GFX942).
define void @v_shuffle_v2bf16_v8bf16__3_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v1, v1, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v1, v1, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__3_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v1, v1, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 3, i32 3>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle test, mask <4, 3>: builds a <2 x bfloat> from %vec0[4] (lane 0) and
; %vec0[3] (lane 1) and stores it to %ptr. The second shuffle operand is
; poison.
; The input comes from inline asm that emits only a "; def" comment; in the
; expected output it occupies v[0:3].
; Expected codegen on every target is one v_bfi_b32 merging v2 and v1 under a
; 0xffff mask held in an SGPR (s4 on GFX900/GFX90A, s2 on GFX942).
define void @v_shuffle_v2bf16_v8bf16__4_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_bfi_b32 v0, s4, v2, v1
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v2, v1
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__4_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v2, v1
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 4, i32 3>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle test, mask <5, 3>: builds a <2 x bfloat> from %vec0[5] (lane 0) and
; %vec0[3] (lane 1) and stores it to %ptr. The second shuffle operand is
; poison.
; The input comes from inline asm that emits only a "; def" comment; in the
; expected output it occupies v[0:3].
; Expected codegen on every target is one v_perm_b32 of v1 and v2 with
; selector 0x7060302 held in an SGPR (s4 on GFX900/GFX90A, s2 on GFX942).
define void @v_shuffle_v2bf16_v8bf16__5_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v1, v2, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v1, v2, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__5_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v1, v2, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 5, i32 3>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle test, mask <6, 3>: builds a <2 x bfloat> from %vec0[6] (lane 0) and
; %vec0[3] (lane 1) and stores it to %ptr. The second shuffle operand is
; poison.
; The input comes from inline asm that emits only a "; def" comment; in the
; expected output it occupies v[0:3].
; Expected codegen on every target is one v_bfi_b32 merging v3 and v1 under a
; 0xffff mask held in an SGPR (s4 on GFX900/GFX90A, s2 on GFX942).
define void @v_shuffle_v2bf16_v8bf16__6_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_bfi_b32 v0, s4, v3, v1
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v3, v1
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__6_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v3, v1
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 6, i32 3>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle test, mask <7, 3>: builds a <2 x bfloat> from %vec0[7] (lane 0) and
; %vec0[3] (lane 1) and stores it to %ptr. The second shuffle operand is
; poison.
; The input comes from inline asm that emits only a "; def" comment; in the
; expected output it occupies v[0:3].
; Expected codegen on every target is one v_perm_b32 of v1 and v3 with
; selector 0x7060302 held in an SGPR (s4 on GFX900/GFX90A, s2 on GFX942).
define void @v_shuffle_v2bf16_v8bf16__7_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v1, v3, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v1, v3, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__7_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v1, v3, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 7, i32 3>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle test, mask <8, 3>: lane 1 is %vec0[3]; lane 0 uses index 8, which
; addresses element 0 of the second shuffle operand, and that operand is
; poison here. The <2 x bfloat> result is stored to %ptr.
; The input comes from inline asm that emits only a "; def" comment; in the
; expected output it occupies v[0:3].
; Expected codegen on every target needs no shuffle instruction: v1 is stored
; as-is.
define void @v_shuffle_v2bf16_v8bf16__8_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v1, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v1, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__8_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v1, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 8, i32 3>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle test, mask <9, 3>: builds a <2 x bfloat> from two <8 x bfloat> inputs
; and stores it to %ptr. Mask indices 8-15 address the second operand, so
; lane 0 is %vec1[1] and lane 1 is %vec0[3].
; Both inputs come from inline asm that emits only a "; def" comment; in the
; expected output they occupy v[0:3] and v[2:5].
; Expected codegen on every target is one v_perm_b32 of v1 and v2 with
; selector 0x7060302 held in an SGPR (s4 on GFX900/GFX90A, s2 on GFX942); the
; GFX942 checks additionally expect an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__9_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v6, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:5]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v1, v2, s4
; GFX900-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v1, v2, s4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__9_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v1, v2, s2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 3>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle test, mask <10, 3>: builds a <2 x bfloat> from two <8 x bfloat>
; inputs and stores it to %ptr. Mask indices 8-15 address the second operand,
; so lane 0 is %vec1[2] and lane 1 is %vec0[3].
; Both inputs come from inline asm that emits only a "; def" comment; in the
; expected output they occupy v[0:3] and v[2:5].
; Expected codegen on every target is one v_bfi_b32 merging v3 and v1 under a
; 0xffff mask held in an SGPR (s4 on GFX900/GFX90A, s2 on GFX942); the GFX942
; checks additionally expect an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__10_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v6, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:5]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_bfi_b32 v0, s4, v3, v1
; GFX900-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v3, v1
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__10_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v3, v1
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 3>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle test, mask <11, 3>: builds a <2 x bfloat> from two <8 x bfloat>
; inputs and stores it to %ptr. Mask indices 8-15 address the second operand,
; so lane 0 is %vec1[3] and lane 1 is %vec0[3].
; Both inputs come from inline asm that emits only a "; def" comment; in the
; expected output they occupy v[0:3] and v[2:5].
; Expected codegen on every target is one v_perm_b32 of v1 and v3 with
; selector 0x7060302 held in an SGPR (s4 on GFX900/GFX90A, s2 on GFX942); the
; GFX942 checks additionally expect an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__11_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v6, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:5]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v1, v3, s4
; GFX900-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v1, v3, s4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__11_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v1, v3, s2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 3>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-source shuffle with mask <12, 3>: result lane 0 is element 4 of %vec1
; (mask index 12 = 8 + 4) and result lane 1 is element 3 of %vec0.
; All three targets are expected to merge v4 (from the second asm result in
; v[2:5]) and v1 (from the first) with one v_bfi_b32 whose mask 0xffff is held
; in an SGPR. gfx942 additionally expects an s_nop 0 before the v_bfi_b32 and
; stores through s[0:1] instead of s[16:17].
define void @v_shuffle_v2bf16_v8bf16__12_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v6, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:5]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_bfi_b32 v0, s4, v4, v1
; GFX900-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v4, v1
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__12_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v4, v1
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Each inline asm yields an opaque <8 x bfloat> value in VGPRs ("=v").
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 3>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-source shuffle with mask <13, 3>: result lane 0 is element 5 of %vec1
; (mask index 13 = 8 + 5) and result lane 1 is element 3 of %vec0.
; All three targets are expected to merge v1 (first asm result) and v4 (second
; asm result, placed in v[2:5]) with one v_perm_b32 using the selector
; 0x7060302 held in an SGPR. gfx942 additionally expects an s_nop 0 before the
; v_perm_b32 and stores through s[0:1] instead of s[16:17].
define void @v_shuffle_v2bf16_v8bf16__13_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v6, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:5]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v1, v4, s4
; GFX900-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v1, v4, s4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__13_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v1, v4, s2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Each inline asm yields an opaque <8 x bfloat> value in VGPRs ("=v").
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 3>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-source shuffle with mask <14, 3>: result lane 0 is element 6 of %vec1
; (mask index 14 = 8 + 6) and result lane 1 is element 3 of %vec0.
; All three targets are expected to merge v5 (last dword of the second asm
; result in v[2:5]) and v1 (first asm result) with one v_bfi_b32 whose mask
; 0xffff is held in an SGPR. gfx942 additionally expects an s_nop 0 before the
; v_bfi_b32 and stores through s[0:1] instead of s[16:17].
define void @v_shuffle_v2bf16_v8bf16__14_3(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v6, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:5]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_bfi_b32 v0, s4, v5, v1
; GFX900-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v5, v1
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__14_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v5, v1
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Each inline asm yields an opaque <8 x bfloat> value in VGPRs ("=v").
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 3>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-source shuffle with mask <poison, 4>: result lane 0 is unconstrained
; and result lane 1 is element 4 of %vec0.
; All three targets are expected to produce the stored dword with a single
; v_lshlrev_b32 of v2 by 16; no permute or bitfield insert is expected.
; gfx942 stores through s[0:1] instead of s[16:17].
define void @v_shuffle_v2bf16_v8bf16__u_4(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__u_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; The inline asm yields an opaque <8 x bfloat> value in VGPRs ("=v").
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 poison, i32 4>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-source shuffle with mask <0, 4>: result lane 0 is element 0 and
; result lane 1 is element 4 of %vec0.
; All three targets are expected to combine v2 and v0 of the asm result with
; one v_perm_b32 using the selector 0x5040100 held in an SGPR.
; gfx942 stores through s[0:1] instead of s[16:17].
define void @v_shuffle_v2bf16_v8bf16__0_4(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v2, v0, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v2, v0, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__0_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v2, v0, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; The inline asm yields an opaque <8 x bfloat> value in VGPRs ("=v").
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 0, i32 4>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-source shuffle with mask <1, 4>: result lane 0 is element 1 and
; result lane 1 is element 4 of %vec0.
; All three targets are expected to combine v2 and v0 of the asm result with
; one v_alignbit_b32 using a shift amount of 16; no selector constant is
; materialised. gfx942 stores through s[0:1] instead of s[16:17].
define void @v_shuffle_v2bf16_v8bf16__1_4(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v2, v0, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v2, v0, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__1_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v2, v0, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; The inline asm yields an opaque <8 x bfloat> value in VGPRs ("=v").
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 1, i32 4>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-source shuffle with mask <2, 4>: result lane 0 is element 2 and
; result lane 1 is element 4 of %vec0.
; All three targets are expected to combine v2 and v1 of the asm result with
; one v_perm_b32 using the selector 0x5040100 held in an SGPR.
; gfx942 stores through s[0:1] instead of s[16:17].
define void @v_shuffle_v2bf16_v8bf16__2_4(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v2, v1, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v2, v1, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__2_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v2, v1, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; The inline asm yields an opaque <8 x bfloat> value in VGPRs ("=v").
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 2, i32 4>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-source shuffle with mask <3, 4>: result lane 0 is element 3 and
; result lane 1 is element 4 of %vec0 (two adjacent elements of the source).
; All three targets are expected to combine v2 and v1 of the asm result with
; one v_alignbit_b32 using a shift amount of 16.
; gfx942 stores through s[0:1] instead of s[16:17].
define void @v_shuffle_v2bf16_v8bf16__3_4(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v2, v1, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v2, v1, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__3_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v2, v1, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; The inline asm yields an opaque <8 x bfloat> value in VGPRs ("=v").
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 3, i32 4>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-source shuffle with mask <4, 4>: both result lanes are element 4 of
; %vec0 (a splat of that element).
; All three targets are expected to use one v_perm_b32 with v2 as both vector
; operands and the selector 0x5040100 held in an SGPR.
; gfx942 stores through s[0:1] instead of s[16:17].
define void @v_shuffle_v2bf16_v8bf16__4_4(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v2, v2, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v2, v2, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__4_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v2, v2, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; The inline asm yields an opaque <8 x bfloat> value in VGPRs ("=v").
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 4, i32 4>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-source shuffle with mask <5, 4>: result lane 0 is element 5 and
; result lane 1 is element 4 of %vec0, i.e. elements 4 and 5 in swapped order.
; All three targets are expected to use one v_alignbit_b32 with v2 as both
; vector operands and a shift amount of 16.
; gfx942 stores through s[0:1] instead of s[16:17].
define void @v_shuffle_v2bf16_v8bf16__5_4(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v2, v2, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v2, v2, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__5_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v2, v2, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; The inline asm yields an opaque <8 x bfloat> value in VGPRs ("=v").
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 5, i32 4>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-source shuffle with mask <6, 4>: result lane 0 is element 6 and
; result lane 1 is element 4 of %vec0.
; All three targets are expected to combine v2 and v3 of the asm result with
; one v_perm_b32 using the selector 0x5040100 held in an SGPR.
; gfx942 stores through s[0:1] instead of s[16:17].
define void @v_shuffle_v2bf16_v8bf16__6_4(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v2, v3, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v2, v3, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__6_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v2, v3, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; The inline asm yields an opaque <8 x bfloat> value in VGPRs ("=v").
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 6, i32 4>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-source shuffle with mask <7, 4>: result lane 0 is element 7 and
; result lane 1 is element 4 of %vec0.
; All three targets are expected to combine v2 and v3 of the asm result with
; one v_alignbit_b32 using a shift amount of 16.
; gfx942 stores through s[0:1] instead of s[16:17].
define void @v_shuffle_v2bf16_v8bf16__7_4(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v2, v3, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v2, v3, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__7_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v2, v3, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; The inline asm yields an opaque <8 x bfloat> value in VGPRs ("=v").
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 7, i32 4>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle with mask <8, 4>: mask index 8 addresses element 0 of the second
; shufflevector operand, which is poison in this function, so result lane 0 is
; unconstrained; result lane 1 is element 4 of %vec0.
; All three targets are expected to produce the stored dword with a single
; v_lshlrev_b32 of v2 by 16, and only one inline-asm def appears in the checks.
; gfx942 stores through s[0:1] instead of s[16:17].
; NOTE(review): unlike the tests for mask indices 9 and above, no second asm
; value is defined here, so index 8 never reads a real element - confirm this
; is the intended coverage.
define void @v_shuffle_v2bf16_v8bf16__8_4(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__8_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; The inline asm yields an opaque <8 x bfloat> value in VGPRs ("=v").
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 8, i32 4>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-source shuffle with mask <9, 4>: result lane 0 is element 1 of %vec1
; (mask index 9 = 8 + 1) and result lane 1 is element 4 of %vec0.
; All three targets are expected to merge the inputs with one v_alignbit_b32
; using a shift amount of 16. The expected register for the second asm result
; differs per target: v[3:6] on gfx900 (so the instruction reads v3) and
; v[4:7] on gfx90a and gfx942 (so it reads v4); the zero offset register moves
; from v7 to v8 accordingly. gfx942 additionally expects an s_nop 0 before the
; v_alignbit_b32 and stores through s[0:1] instead of s[16:17].
define void @v_shuffle_v2bf16_v8bf16__9_4(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v7, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[3:6]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v0, v2, v3, 16
; GFX900-NEXT:    global_store_dword v7, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v0, v2, v4, 16
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__9_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_alignbit_b32 v0, v2, v4, 16
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Each inline asm yields an opaque <8 x bfloat> value in VGPRs ("=v").
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 4>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-source shuffle with mask <10, 4>: result lane 0 is element 2 of %vec1
; (mask index 10 = 8 + 2) and result lane 1 is element 4 of %vec0.
; All three targets are expected to merge the inputs with one v_perm_b32 using
; the selector 0x5040100 held in an SGPR. The expected register for the second
; asm result differs per target: v[3:6] on gfx900 (so the instruction reads
; v4) and v[4:7] on gfx90a and gfx942 (so it reads v5). gfx942 additionally
; expects an s_nop 0 before the v_perm_b32 and stores through s[0:1] instead
; of s[16:17].
define void @v_shuffle_v2bf16_v8bf16__10_4(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v7, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[3:6]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v2, v4, s4
; GFX900-NEXT:    global_store_dword v7, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v2, v5, s4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__10_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v2, v5, s2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Each inline asm yields an opaque <8 x bfloat> value in VGPRs ("=v").
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 4>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-source shuffle with mask <11, 4>: result lane 0 is element 3 of %vec1
; (mask index 11 = 8 + 3) and result lane 1 is element 4 of %vec0.
; All three targets are expected to merge the inputs with one v_alignbit_b32
; using a shift amount of 16. The expected register for the second asm result
; differs per target: v[3:6] on gfx900 (so the instruction reads v4) and
; v[4:7] on gfx90a and gfx942 (so it reads v5). gfx942 additionally expects an
; s_nop 0 before the v_alignbit_b32 and stores through s[0:1] instead of
; s[16:17].
define void @v_shuffle_v2bf16_v8bf16__11_4(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v7, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[3:6]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v0, v2, v4, 16
; GFX900-NEXT:    global_store_dword v7, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v0, v2, v5, 16
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__11_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_alignbit_b32 v0, v2, v5, 16
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Each inline asm yields an opaque <8 x bfloat> value in VGPRs ("=v").
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 4>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-source shuffle with mask <12, 4>: result lane 0 is element 4 of %vec1
; (mask index 12 = 8 + 4) and result lane 1 is element 4 of %vec0, i.e. the
; same element position taken from each source.
; All three targets are expected to merge the inputs with one v_perm_b32 using
; the selector 0x5040100 held in an SGPR. The expected register for the second
; asm result differs per target: v[3:6] on gfx900 (so the instruction reads
; v5) and v[4:7] on gfx90a and gfx942 (so it reads v6). gfx942 additionally
; expects an s_nop 0 before the v_perm_b32 and stores through s[0:1] instead
; of s[16:17].
define void @v_shuffle_v2bf16_v8bf16__12_4(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v7, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[3:6]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v2, v5, s4
; GFX900-NEXT:    global_store_dword v7, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v2, v6, s4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__12_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v2, v6, s2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Each inline asm yields an opaque <8 x bfloat> value in VGPRs ("=v").
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 4>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-input shuffle with mask <13, 4>. Lane 0 is element 5 of %vec1 (mask
; indices 8..15 address the second operand) and lane 1 is element 4 of %vec0.
; Expected lowering per the checks is a single v_alignbit_b32 with a shift
; amount of 16, so no selector constant is loaded into an SGPR; gfx942 also has
; an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__13_4(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v7, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[3:6]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v0, v2, v5, 16
; GFX900-NEXT:    global_store_dword v7, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v0, v2, v6, 16
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__13_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_alignbit_b32 v0, v2, v6, 16
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 4>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-input shuffle with mask <14, 4>. Lane 0 is element 6 of %vec1 (mask
; indices 8..15 address the second operand) and lane 1 is element 4 of %vec0.
; Expected lowering per the checks is a single v_perm_b32 with selector constant
; 0x5040100 reading v2 and the last register of the second asm def; gfx942 also
; has an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__14_4(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v7, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[3:6]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v2, v6, s4
; GFX900-NEXT:    global_store_dword v7, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v2, v7, s4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__14_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v2, v7, s2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 4>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-input shuffle with mask <poison, 5>. Lane 0 is unconstrained and lane 1
; is element 5 of %vec0; the second shuffle operand is poison.
; Expected lowering per the checks has no permute instruction at all, since v2
; of the v[0:3] asm def is stored directly with global_store_dword.
define void @v_shuffle_v2bf16_v8bf16__u_5(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v2, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v2, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__u_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v2, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 poison, i32 5>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-input shuffle with mask <0, 5>. Lane 0 is element 0 and lane 1 is
; element 5 of %vec0; the second shuffle operand is poison.
; Expected lowering per the checks is a single v_bfi_b32 with mask constant
; 0xffff combining v0 and v2 of the v[0:3] asm def.
define void @v_shuffle_v2bf16_v8bf16__0_5(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_bfi_b32 v0, s4, v0, v2
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v0, v2
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__0_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v0, v2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 0, i32 5>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-input shuffle with mask <1, 5>. Lane 0 is element 1 and lane 1 is
; element 5 of %vec0; the second shuffle operand is poison.
; Expected lowering per the checks is a single v_perm_b32 with selector constant
; 0x7060302 reading v2 and v0 of the v[0:3] asm def.
define void @v_shuffle_v2bf16_v8bf16__1_5(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v2, v0, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v2, v0, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__1_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v2, v0, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 1, i32 5>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-input shuffle with mask <2, 5>. Lane 0 is element 2 and lane 1 is
; element 5 of %vec0; the second shuffle operand is poison.
; Expected lowering per the checks is a single v_bfi_b32 with mask constant
; 0xffff combining v1 and v2 of the v[0:3] asm def.
define void @v_shuffle_v2bf16_v8bf16__2_5(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_bfi_b32 v0, s4, v1, v2
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v1, v2
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__2_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v1, v2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 2, i32 5>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-input shuffle with mask <3, 5>. Lane 0 is element 3 and lane 1 is
; element 5 of %vec0; the second shuffle operand is poison.
; Expected lowering per the checks is a single v_perm_b32 with selector constant
; 0x7060302 reading v2 and v1 of the v[0:3] asm def.
define void @v_shuffle_v2bf16_v8bf16__3_5(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v2, v1, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v2, v1, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__3_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v2, v1, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 3, i32 5>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-input shuffle with mask <4, 5>. Lanes 0 and 1 are the adjacent
; elements 4 and 5 of %vec0; the second shuffle operand is poison.
; Expected lowering per the checks has no permute instruction, since v2 of the
; v[0:3] asm def is stored directly with global_store_dword.
define void @v_shuffle_v2bf16_v8bf16__4_5(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v2, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v2, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__4_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v2, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 4, i32 5>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-input shuffle with mask <5, 5>. Both result lanes are element 5 of
; %vec0 (a splat of one element); the second shuffle operand is poison.
; Expected lowering per the checks is a single v_perm_b32 with selector constant
; 0x7060302 using v2 of the v[0:3] asm def for both source operands.
define void @v_shuffle_v2bf16_v8bf16__5_5(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v2, v2, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v2, v2, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__5_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v2, v2, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 5, i32 5>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-input shuffle with mask <6, 5>. Lane 0 is element 6 and lane 1 is
; element 5 of %vec0; the second shuffle operand is poison.
; Expected lowering per the checks is a single v_bfi_b32 with mask constant
; 0xffff combining v3 and v2 of the v[0:3] asm def.
define void @v_shuffle_v2bf16_v8bf16__6_5(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_bfi_b32 v0, s4, v3, v2
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v3, v2
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__6_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v3, v2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 6, i32 5>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-input shuffle with mask <7, 5>. Lane 0 is element 7 and lane 1 is
; element 5 of %vec0; the second shuffle operand is poison.
; Expected lowering per the checks is a single v_perm_b32 with selector constant
; 0x7060302 reading v2 and v3 of the v[0:3] asm def.
define void @v_shuffle_v2bf16_v8bf16__7_5(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v2, v3, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v2, v3, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__7_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v2, v3, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 7, i32 5>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle with mask <8, 5>. Mask index 8 addresses element 0 of the second
; operand, which is poison in this function (no %vec1 is defined), so lane 0 is
; unconstrained; lane 1 is element 5 of %vec0.
; Expected lowering per the checks has no permute instruction, since v2 of the
; v[0:3] asm def is stored directly with global_store_dword.
; NOTE(review) - the name suggests a two-input case like the 9_5 .. 14_5 tests,
; but with a poison second operand this covers the same lowering as the u_5
; test. Possibly a quirk of how the test was generated; confirm before changing
; and regenerate the checks if a %vec1 input is added.
define void @v_shuffle_v2bf16_v8bf16__8_5(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v2, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v2, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__8_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v2, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 8, i32 5>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-input shuffle with mask <9, 5>. Lane 0 is element 1 of %vec1 (mask
; indices 8..15 address the second operand) and lane 1 is element 5 of %vec0.
; Expected lowering per the checks is a single v_perm_b32 with selector constant
; 0x7060302 reading v2 and the first register of the second asm def (v3 on
; gfx900, v4 on gfx90a/gfx942); gfx942 also has an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__9_5(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v7, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[3:6]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v2, v3, s4
; GFX900-NEXT:    global_store_dword v7, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v2, v4, s4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__9_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v2, v4, s2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 5>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-input shuffle with mask <10, 5>. Lane 0 is element 2 of %vec1 (mask
; indices 8..15 address the second operand) and lane 1 is element 5 of %vec0.
; Expected lowering per the checks is a single v_bfi_b32 with mask constant
; 0xffff combining the second register of the second asm def (v4 on gfx900, v5
; on gfx90a/gfx942) with v2; gfx942 also has an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__10_5(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v7, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[3:6]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_bfi_b32 v0, s4, v4, v2
; GFX900-NEXT:    global_store_dword v7, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v5, v2
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__10_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v5, v2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 5>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-input shuffle with mask <11, 5>. Lane 0 is element 3 of %vec1 (mask
; indices 8..15 address the second operand) and lane 1 is element 5 of %vec0.
; Expected lowering per the checks is a single v_perm_b32 with selector constant
; 0x7060302 reading v2 and the second register of the second asm def (v4 on
; gfx900, v5 on gfx90a/gfx942); gfx942 also has an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__11_5(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v7, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[3:6]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v2, v4, s4
; GFX900-NEXT:    global_store_dword v7, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v2, v5, s4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__11_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v2, v5, s2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 5>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-input shuffle with mask <12, 5>. Lane 0 is element 4 of %vec1 (mask
; indices 8..15 address the second operand) and lane 1 is element 5 of %vec0.
; Expected lowering per the checks is a single v_bfi_b32 with mask constant
; 0xffff combining the third register of the second asm def (v5 on gfx900, v6
; on gfx90a/gfx942) with v2; gfx942 also has an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__12_5(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v7, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[3:6]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_bfi_b32 v0, s4, v5, v2
; GFX900-NEXT:    global_store_dword v7, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v6, v2
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__12_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v6, v2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 5>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-input shuffle with mask <13, 5>. Lane 0 is element 5 of %vec1 (mask
; indices 8..15 address the second operand) and lane 1 is element 5 of %vec0.
; Expected lowering per the checks is a single v_perm_b32 with selector constant
; 0x7060302 reading v2 and the third register of the second asm def (v5 on
; gfx900, v6 on gfx90a/gfx942); gfx942 also has an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__13_5(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v7, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[3:6]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v2, v5, s4
; GFX900-NEXT:    global_store_dword v7, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v2, v6, s4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__13_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v2, v6, s2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 5>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-input shuffle with mask <14, 5>. Lane 0 is element 6 of %vec1 (mask
; indices 8..15 address the second operand) and lane 1 is element 5 of %vec0.
; Expected lowering per the checks is a single v_bfi_b32 with mask constant
; 0xffff combining the last register of the second asm def (v6 on gfx900, v7 on
; gfx90a/gfx942) with v2; gfx942 also has an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__14_5(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v7, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[3:6]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_bfi_b32 v0, s4, v6, v2
; GFX900-NEXT:    global_store_dword v7, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v7, v2
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__14_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v7, v2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 5>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-source shuffle with mask <poison, 6>: result lane 0 is unspecified and
; result lane 1 is element 6 of %vec0.
; The assertions below expect a v_lshlrev_b32 of v3 by 16 feeding the dword
; store; the expected sequence is the same on all three targets apart from the
; SGPR pair that holds %ptr.
define void @v_shuffle_v2bf16_v8bf16__u_6(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__u_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 poison, i32 6>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-source shuffle with mask <0, 6>: result lane 0 is element 0 and result
; lane 1 is element 6 of %vec0.
; The assertions below expect one v_perm_b32 over v3 and v0 with selector
; 0x5040100, followed by the dword store.
define void @v_shuffle_v2bf16_v8bf16__0_6(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v3, v0, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v3, v0, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__0_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v3, v0, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 0, i32 6>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-source shuffle with mask <1, 6>: result lane 0 is element 1 and result
; lane 1 is element 6 of %vec0.
; The assertions below expect one v_alignbit_b32 of v3 and v0 with a shift of
; 16, followed by the dword store; no selector constant is materialized.
define void @v_shuffle_v2bf16_v8bf16__1_6(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v3, v0, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v3, v0, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__1_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v3, v0, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 1, i32 6>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-source shuffle with mask <2, 6>: result lane 0 is element 2 and result
; lane 1 is element 6 of %vec0.
; The assertions below expect one v_perm_b32 over v3 and v1 with selector
; 0x5040100, followed by the dword store.
define void @v_shuffle_v2bf16_v8bf16__2_6(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v3, v1, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v3, v1, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__2_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v3, v1, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 2, i32 6>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-source shuffle with mask <3, 6>: result lane 0 is element 3 and result
; lane 1 is element 6 of %vec0.
; The assertions below expect one v_alignbit_b32 of v3 and v1 with a shift of
; 16, followed by the dword store.
define void @v_shuffle_v2bf16_v8bf16__3_6(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v3, v1, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v3, v1, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__3_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v3, v1, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 3, i32 6>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-source shuffle with mask <4, 6>: result lane 0 is element 4 and result
; lane 1 is element 6 of %vec0.
; The assertions below expect one v_perm_b32 over v3 and v2 with selector
; 0x5040100, followed by the dword store.
define void @v_shuffle_v2bf16_v8bf16__4_6(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v3, v2, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v3, v2, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__4_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v3, v2, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 4, i32 6>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-source shuffle with mask <5, 6>: result lane 0 is element 5 and result
; lane 1 is element 6 of %vec0 (two adjacent source elements).
; The assertions below expect one v_alignbit_b32 of v3 and v2 with a shift of
; 16, followed by the dword store.
define void @v_shuffle_v2bf16_v8bf16__5_6(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v3, v2, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v3, v2, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__5_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v3, v2, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 5, i32 6>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-source shuffle with mask <6, 6>: both result lanes are element 6 of
; %vec0 (a splat of one source element).
; The assertions below expect one v_perm_b32 with v3 as both vector operands
; and selector 0x5040100, followed by the dword store.
define void @v_shuffle_v2bf16_v8bf16__6_6(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v3, v3, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__6_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v3, v3, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 6, i32 6>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-source shuffle with mask <7, 6>: result lane 0 is element 7 and result
; lane 1 is element 6 of %vec0 (the last two source elements, swapped).
; The assertions below expect one v_alignbit_b32 with v3 as both source
; operands and a shift of 16, followed by the dword store.
define void @v_shuffle_v2bf16_v8bf16__7_6(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v3, v3, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v3, v3, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__7_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v3, v3, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 7, i32 6>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle with mask <8, 6> where the second operand is poison: mask index 8
; addresses element 0 of the second operand, so result lane 0 comes from poison
; and only result lane 1 (element 6 of %vec0) carries a defined value.
; The assertions below therefore expect the same instruction sequence as the
; <poison, 6> case: a single asm def and a v_lshlrev_b32 of v3 by 16.
; NOTE(review): the 9..14 variants in this file define a second asm vector
; (%vec1) for the high indices while this one does not - confirm that leaving
; index 8 on a poison operand is the intended coverage. The assertions are
; autogenerated, so regenerate them rather than editing by hand if this changes.
define void @v_shuffle_v2bf16_v8bf16__8_6(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__8_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 8, i32 6>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-source shuffle with mask <9, 6>: result lane 0 is element 1 of %vec1
; (mask index 9 = 8 + 1) and result lane 1 is element 6 of %vec0.
; The assertions below expect both inline-asm defs and then one v_alignbit_b32
; of v3 and v4 with a shift of 16; the gfx942 expectation additionally has an
; s_nop 0 between the second asm def and the v_alignbit_b32.
define void @v_shuffle_v2bf16_v8bf16__9_6(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v8, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v0, v3, v4, 16
; GFX900-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v0, v3, v4, 16
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__9_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_alignbit_b32 v0, v3, v4, 16
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 6>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-source shuffle with mask <10, 6>: result lane 0 is element 2 of %vec1
; (mask index 10 = 8 + 2) and result lane 1 is element 6 of %vec0.
; The assertions below expect both inline-asm defs and then one v_perm_b32 over
; v3 and v5 with selector 0x5040100; the gfx942 expectation additionally has an
; s_nop 0 between the second asm def and the v_perm_b32.
define void @v_shuffle_v2bf16_v8bf16__10_6(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v8, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v3, v5, s4
; GFX900-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v3, v5, s4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__10_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v3, v5, s2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 6>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-source shuffle with mask <11, 6>: result lane 0 is element 3 of %vec1
; (mask index 11 = 8 + 3) and result lane 1 is element 6 of %vec0.
; The assertions below expect both inline-asm defs and then one v_alignbit_b32
; of v3 and v5 with a shift of 16; the gfx942 expectation additionally has an
; s_nop 0 between the second asm def and the v_alignbit_b32.
define void @v_shuffle_v2bf16_v8bf16__11_6(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v8, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v0, v3, v5, 16
; GFX900-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v0, v3, v5, 16
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__11_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_alignbit_b32 v0, v3, v5, 16
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 6>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-source shuffle with mask <12, 6>: result lane 0 is element 4 of %vec1
; (mask index 12 = 8 + 4) and result lane 1 is element 6 of %vec0.
; The assertions below expect both inline-asm defs and then one v_perm_b32 over
; v3 and v6 with selector 0x5040100; the gfx942 expectation additionally has an
; s_nop 0 between the second asm def and the v_perm_b32.
define void @v_shuffle_v2bf16_v8bf16__12_6(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v8, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v3, v6, s4
; GFX900-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v3, v6, s4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__12_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v3, v6, s2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 6>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-source shuffle with mask <13, 6>: result lane 0 is element 5 of %vec1
; (mask index 13 = 8 + 5) and result lane 1 is element 6 of %vec0.
; The assertions below expect both inline-asm defs and then one v_alignbit_b32
; of v3 and v6 with a shift of 16; the gfx942 expectation additionally has an
; s_nop 0 between the second asm def and the v_alignbit_b32.
define void @v_shuffle_v2bf16_v8bf16__13_6(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v8, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v0, v3, v6, 16
; GFX900-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v0, v3, v6, 16
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__13_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_alignbit_b32 v0, v3, v6, 16
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 6>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-source shuffle with mask <14, 6>: result lane 0 is element 6 of %vec1
; (mask index 14 = 8 + 6) and result lane 1 is element 6 of %vec0.
; The assertions below expect both inline-asm defs and then one v_perm_b32 over
; v3 and v7 with selector 0x5040100; the gfx942 expectation additionally has an
; s_nop 0 between the second asm def and the v_perm_b32.
define void @v_shuffle_v2bf16_v8bf16__14_6(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v8, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v3, v7, s4
; GFX900-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v3, v7, s4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__14_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v3, v7, s2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 6>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-source shuffle with mask <poison, 7>: result lane 0 is unspecified and
; result lane 1 is element 7 of %vec0.
; The assertions below expect no shuffle instruction at all: v3 from the asm
; def is passed straight to the dword store.
define void @v_shuffle_v2bf16_v8bf16__u_7(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v3, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v3, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__u_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v3, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 poison, i32 7>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Single-source shuffle with mask <0, 7>: result lane 0 is element 0 and result
; lane 1 is element 7 of %vec0 (the first and last source elements).
; The assertions below expect one v_bfi_b32 over v0 and v3 using a 0xffff mask,
; followed by the dword store.
define void @v_shuffle_v2bf16_v8bf16__0_7(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_bfi_b32 v0, s4, v0, v3
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v0, v3
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__0_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v0, v3
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 0, i32 7>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <1, 7> on a single <8 x bfloat> source: lane 0 = %vec0[1], lane 1 = %vec0[7].
; Both elements sit in the high half of their dword (v0 and v3), so the expected
; output combines them with one v_perm_b32 using byte selector 0x7060302.
define void @v_shuffle_v2bf16_v8bf16__1_7(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v3, v0, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v3, v0, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__1_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v3, v0, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 1, i32 7>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <2, 7> on a single <8 x bfloat> source: lane 0 = %vec0[2], lane 1 = %vec0[7].
; The expected output keeps the low 16 bits of v1 and the high 16 bits of v3
; via v_bfi_b32 with a 0xffff mask.
define void @v_shuffle_v2bf16_v8bf16__2_7(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_bfi_b32 v0, s4, v1, v3
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v1, v3
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__2_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v1, v3
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 2, i32 7>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <3, 7> on a single <8 x bfloat> source: lane 0 = %vec0[3], lane 1 = %vec0[7].
; Both elements sit in the high half of their dword (v1 and v3), so the expected
; output combines them with one v_perm_b32 using byte selector 0x7060302.
define void @v_shuffle_v2bf16_v8bf16__3_7(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v3, v1, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v3, v1, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__3_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v3, v1, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 3, i32 7>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <4, 7> on a single <8 x bfloat> source: lane 0 = %vec0[4], lane 1 = %vec0[7].
; The expected output keeps the low 16 bits of v2 and the high 16 bits of v3
; via v_bfi_b32 with a 0xffff mask.
define void @v_shuffle_v2bf16_v8bf16__4_7(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_bfi_b32 v0, s4, v2, v3
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v2, v3
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__4_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v2, v3
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 4, i32 7>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <5, 7> on a single <8 x bfloat> source: lane 0 = %vec0[5], lane 1 = %vec0[7].
; Both elements sit in the high half of their dword (v2 and v3), so the expected
; output combines them with one v_perm_b32 using byte selector 0x7060302.
define void @v_shuffle_v2bf16_v8bf16__5_7(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v3, v2, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v3, v2, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__5_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v3, v2, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 5, i32 7>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <6, 7> picks the last two elements of %vec0 in their original order.
; The expected output needs no permute: it stores v3 (the last dword of the
; asm-defined v[0:3]) directly.
define void @v_shuffle_v2bf16_v8bf16__6_7(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v3, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v3, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__6_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v3, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 6, i32 7>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <7, 7> splats %vec0[7] into both result lanes.
; The expected output uses v_perm_b32 with v3 as both data operands and byte
; selector 0x7060302, replicating the high half of v3.
define void @v_shuffle_v2bf16_v8bf16__7_7(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v3, v3, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__7_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v3, v3, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 7, i32 7>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <8, 7>: index 8 addresses element 0 of the second shuffle operand, and
; index 7 is %vec0[7]. The expected output stores v3 unmodified.
; NOTE(review): the second operand here is poison (unlike the 9..14 variants,
; which define a %vec1), so lane 0 is undefined and this does not exercise a
; two-source shuffle. Confirm whether that is intended by the test generator.
define void @v_shuffle_v2bf16_v8bf16__8_7(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v3, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v3, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__8_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v3, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 8, i32 7>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-source shuffle with mask <9, 7>: lane 0 = %vec1[1], lane 1 = %vec0[7].
; %vec0 is defined in v[0:3] and %vec1 in v[4:7]; the expected output combines
; the high halves of v4 and v3 with v_perm_b32 (selector 0x7060302).
; The gfx942 run additionally expects an s_nop 0 between the second asm block
; and the permute.
define void @v_shuffle_v2bf16_v8bf16__9_7(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v8, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v3, v4, s4
; GFX900-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v3, v4, s4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__9_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v3, v4, s2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 7>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-source shuffle with mask <10, 7>: lane 0 = %vec1[2], lane 1 = %vec0[7].
; %vec0 is defined in v[0:3] and %vec1 in v[4:7]; the expected output keeps the
; low 16 bits of v5 and the high 16 bits of v3 via v_bfi_b32 with a 0xffff mask.
; The gfx942 run additionally expects an s_nop 0 between the second asm block
; and the bitfield insert.
define void @v_shuffle_v2bf16_v8bf16__10_7(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v8, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_bfi_b32 v0, s4, v5, v3
; GFX900-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v5, v3
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__10_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v5, v3
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 7>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-source shuffle with mask <11, 7>: lane 0 = %vec1[3], lane 1 = %vec0[7].
; %vec0 is defined in v[0:3] and %vec1 in v[4:7]; the expected output combines
; the high halves of v5 and v3 with v_perm_b32 (selector 0x7060302).
; The gfx942 run additionally expects an s_nop 0 between the second asm block
; and the permute.
define void @v_shuffle_v2bf16_v8bf16__11_7(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v8, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v3, v5, s4
; GFX900-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v3, v5, s4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__11_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v3, v5, s2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 7>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-source shuffle with mask <12, 7>: lane 0 = %vec1[4], lane 1 = %vec0[7].
; %vec0 is defined in v[0:3] and %vec1 in v[4:7]; the expected output keeps the
; low 16 bits of v6 and the high 16 bits of v3 via v_bfi_b32 with a 0xffff mask.
; The gfx942 run additionally expects an s_nop 0 between the second asm block
; and the bitfield insert.
define void @v_shuffle_v2bf16_v8bf16__12_7(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v8, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_bfi_b32 v0, s4, v6, v3
; GFX900-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v6, v3
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__12_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v6, v3
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 7>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-source shuffle with mask <13, 7>: lane 0 = %vec1[5], lane 1 = %vec0[7].
; %vec0 is defined in v[0:3] and %vec1 in v[4:7]; the expected output combines
; the high halves of v6 and v3 with v_perm_b32 (selector 0x7060302).
; The gfx942 run additionally expects an s_nop 0 between the second asm block
; and the permute.
define void @v_shuffle_v2bf16_v8bf16__13_7(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v8, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v3, v6, s4
; GFX900-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v3, v6, s4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__13_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v3, v6, s2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 7>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Two-source shuffle with mask <14, 7>: lane 0 = %vec1[6], lane 1 = %vec0[7].
; %vec0 is defined in v[0:3] and %vec1 in v[4:7]; the expected output keeps the
; low 16 bits of v7 and the high 16 bits of v3 via v_bfi_b32 with a 0xffff mask.
; The gfx942 run additionally expects an s_nop 0 between the second asm block
; and the bitfield insert.
define void @v_shuffle_v2bf16_v8bf16__14_7(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v8, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_bfi_b32 v0, s4, v7, v3
; GFX900-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v7, v3
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__14_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v7, v3
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 7>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <poison, 8>: lane 0 is explicitly undefined, and index 8 addresses
; element 0 of the second shuffle operand, which is poison here.
; The expected output, shared by all three targets, contains only the entry
; wait and the return: no asm def and no store.
; NOTE(review): since the second operand is poison, no defined element is
; selected and nothing target-specific is checked. Confirm this is the intended
; coverage for the "8" column.
define void @v_shuffle_v2bf16_v8bf16__u_8(ptr addrspace(1) inreg %ptr) {
; GFX9-LABEL: v_shuffle_v2bf16_v8bf16__u_8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 poison, i32 8>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <0, 8>: lane 0 = %vec0[0]; index 8 addresses element 0 of the second
; shuffle operand. The expected output stores v0 unmodified.
; NOTE(review): the second operand is poison, so lane 1 is undefined and this
; does not exercise a two-source shuffle. Confirm this is intended.
define void @v_shuffle_v2bf16_v8bf16__0_8(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__0_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 0, i32 8>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <1, 8>: lane 0 = %vec0[1]; index 8 addresses element 0 of the second
; shuffle operand. The expected output moves the high half of v0 into the low
; half of the result with v_alignbit_b32 (shift by 16).
; NOTE(review): the second operand is poison, so lane 1 is undefined. The
; alignbit's high-word source is an SGPR (s4, or s0 on gfx942) that this
; function never writes, which is only acceptable because that lane is
; undefined. Confirm this is the intended coverage.
define void @v_shuffle_v2bf16_v8bf16__1_8(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, s4, v0, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, s4, v0, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__1_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, s0, v0, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 1, i32 8>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <2, 8>: lane 0 = %vec0[2]; index 8 addresses element 0 of the second
; shuffle operand. The expected output stores v1 unmodified.
; NOTE(review): the second operand is poison, so lane 1 is undefined and this
; does not exercise a two-source shuffle. Confirm this is intended.
define void @v_shuffle_v2bf16_v8bf16__2_8(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v1, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v1, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__2_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v1, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 2, i32 8>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <3, 8>: lane 0 = %vec0[3]; index 8 addresses element 0 of the second
; shuffle operand. The expected output moves the high half of v1 into the low
; half of the result with v_alignbit_b32 (shift by 16).
; NOTE(review): the second operand is poison, so lane 1 is undefined. The
; alignbit's high-word source is an SGPR (s4, or s0 on gfx942) that this
; function never writes, which is only acceptable because that lane is
; undefined. Confirm this is the intended coverage.
define void @v_shuffle_v2bf16_v8bf16__3_8(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, s4, v1, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, s4, v1, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__3_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, s0, v1, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 3, i32 8>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <4, 8>: lane 0 = %vec0[4]; index 8 addresses element 0 of the second
; shuffle operand. The expected output stores v2 unmodified.
; NOTE(review): the second operand is poison, so lane 1 is undefined and this
; does not exercise a two-source shuffle. Confirm this is intended.
define void @v_shuffle_v2bf16_v8bf16__4_8(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v2, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v2, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__4_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v2, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 4, i32 8>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <5, 8> with a poison second operand: lane 0 reads element 5 of
; %vec0, lane 1 addresses the poison operand. The expected code is one
; v_alignbit_b32 by 16 on v2 whose first source is an SGPR (s4 / s0) instead
; of a shuffle input.
define void @v_shuffle_v2bf16_v8bf16__5_8(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, s4, v2, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, s4, v2, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__5_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, s0, v2, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 5, i32 8>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <6, 8> with a poison second operand: lane 0 reads element 6 of
; %vec0, lane 1 addresses the poison operand. No shuffle instruction is
; expected; v3 of the asm-defined v[0:3] tuple is stored unchanged.
define void @v_shuffle_v2bf16_v8bf16__6_8(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v3, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v3, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__6_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v3, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 6, i32 8>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <7, 8> with a poison second operand: lane 0 reads element 7 of
; %vec0, lane 1 addresses the poison operand. The expected code is one
; v_alignbit_b32 by 16 on v3 whose first source is an SGPR (s4 / s0) instead
; of a shuffle input.
define void @v_shuffle_v2bf16_v8bf16__7_8(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, s4, v3, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, s4, v3, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__7_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, s0, v3, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 7, i32 8>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <8, 8> with a poison second operand: both lanes address the
; poison operand, so nothing from %vec0 is used. The expected output (shared
; by all three targets) is only the entry wait and the return; neither the
; inline asm def nor the store appears.
define void @v_shuffle_v2bf16_v8bf16__8_8(ptr addrspace(1) inreg %ptr) {
; GFX9-LABEL: v_shuffle_v2bf16_v8bf16__8_8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 8, i32 8>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <9, 8>: both lanes come from the second operand (elements 1 and
; 0 of %vec1), i.e. the first register's two halves in swapped order. Only one
; inline asm def appears in the expected output (presumably the %vec1 def,
; since %vec0 is unused), followed by v_alignbit_b32 v0, v0, v0, 16.
define void @v_shuffle_v2bf16_v8bf16__9_8(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v0, v0, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v0, v0, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__9_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v0, v0, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 8>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <10, 8>: both lanes come from the second operand (elements 2
; and 0 of %vec1). Only one inline asm def appears in the expected output
; (presumably the %vec1 def, since %vec0 is unused). The two halves are
; combined with v_perm_b32 on v0 and v1 using byte selector 0x5040100.
define void @v_shuffle_v2bf16_v8bf16__10_8(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v0, v1, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v0, v1, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__10_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v0, v1, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 8>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <11, 8>: both lanes come from the second operand (elements 3
; and 0 of %vec1). Only one inline asm def appears in the expected output
; (presumably the %vec1 def, since %vec0 is unused). The result is formed by
; v_alignbit_b32 v0, v0, v1, 16.
define void @v_shuffle_v2bf16_v8bf16__11_8(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v0, v1, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v0, v1, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__11_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v0, v1, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 8>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <12, 8>: both lanes come from the second operand (elements 4
; and 0 of %vec1). Only one inline asm def appears in the expected output
; (presumably the %vec1 def, since %vec0 is unused). The two halves are
; combined with v_perm_b32 on v0 and v2 using byte selector 0x5040100.
define void @v_shuffle_v2bf16_v8bf16__12_8(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v0, v2, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v0, v2, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__12_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v0, v2, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 8>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <13, 8>: both lanes come from the second operand (elements 5
; and 0 of %vec1). Only one inline asm def appears in the expected output
; (presumably the %vec1 def, since %vec0 is unused). The result is formed by
; v_alignbit_b32 v0, v0, v2, 16.
define void @v_shuffle_v2bf16_v8bf16__13_8(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v0, v2, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v0, v2, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__13_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v0, v2, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 8>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <14, 8>: both lanes come from the second operand (elements 6
; and 0 of %vec1). Only one inline asm def appears in the expected output
; (presumably the %vec1 def, since %vec0 is unused). The two halves are
; combined with v_perm_b32 on v0 and v3 using byte selector 0x5040100.
define void @v_shuffle_v2bf16_v8bf16__14_8(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v0, v3, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v0, v3, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__14_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v0, v3, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 8>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <poison, 9>: lane 0 is unconstrained and lane 1 reads element 1
; of %vec1. Only one inline asm def appears in the expected output (presumably
; the %vec1 def, since %vec0 is unused), and v0 is stored unchanged with no
; shuffle instruction.
define void @v_shuffle_v2bf16_v8bf16__u_9(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__u_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 poison, i32 9>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <0, 9>: lane 0 reads element 0 of %vec0 and lane 1 reads
; element 1 of %vec1, so both inline asm defs appear in the expected output.
; The second def's register tuple overlaps the first (v[1:4] on gfx900,
; v[2:5] on gfx90a/gfx942), leaving v0 as the one first-def register that is
; read afterwards. The halves are merged with v_bfi_b32 under mask 0xffff;
; gfx942 additionally expects an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__0_9(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v5, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:4]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_bfi_b32 v0, s4, v0, v1
; GFX900-NEXT:    global_store_dword v5, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v0, v2
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__0_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v0, v2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 0, i32 9>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <1, 9>: lane 0 reads element 1 of %vec0 and lane 1 reads
; element 1 of %vec1, so both inline asm defs appear in the expected output.
; The second def's register tuple overlaps the first (v[1:4] on gfx900,
; v[2:5] on gfx90a/gfx942), leaving v0 as the one first-def register that is
; read afterwards. The halves are merged with v_perm_b32 using byte selector
; 0x7060302; gfx942 additionally expects an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__1_9(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v5, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:4]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v1, v0, s4
; GFX900-NEXT:    global_store_dword v5, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v2, v0, s4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__1_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v2, v0, s2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 1, i32 9>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <2, 9>: lane 0 reads element 2 of %vec0 and lane 1 reads
; element 1 of %vec1, so both inline asm defs appear in the expected output.
; On all three targets the second def is placed at v[2:5], leaving v1 of the
; first def intact. The halves are merged with v_bfi_b32 under mask 0xffff on
; v1 and v2; gfx942 additionally expects an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__2_9(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v6, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:5]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_bfi_b32 v0, s4, v1, v2
; GFX900-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v1, v2
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__2_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v1, v2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 2, i32 9>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <3, 9>: lane 0 reads element 3 of %vec0 and lane 1 reads
; element 1 of %vec1, so both inline asm defs appear in the expected output.
; On all three targets the second def is placed at v[2:5], leaving v1 of the
; first def intact. The halves are merged with v_perm_b32 on v2 and v1 using
; byte selector 0x7060302; gfx942 additionally expects an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__3_9(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v6, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:5]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v2, v1, s4
; GFX900-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v2, v1, s4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__3_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v2, v1, s2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 3, i32 9>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <4, 9>: lane 0 reads element 4 of %vec0 and lane 1 reads
; element 1 of %vec1, so both inline asm defs appear in the expected output.
; The second def is placed at v[3:6] on gfx900 and v[4:7] on gfx90a/gfx942,
; leaving v2 of the first def intact. The halves are merged with v_bfi_b32
; under mask 0xffff; gfx942 additionally expects an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__4_9(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v7, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[3:6]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_bfi_b32 v0, s4, v2, v3
; GFX900-NEXT:    global_store_dword v7, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v2, v4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__4_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v2, v4
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 4, i32 9>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <5, 9>: lane 0 reads element 5 of %vec0 and lane 1 reads
; element 1 of %vec1, so both inline asm defs appear in the expected output.
; The second def is placed at v[3:6] on gfx900 and v[4:7] on gfx90a/gfx942,
; leaving v2 of the first def intact. The halves are merged with v_perm_b32
; using byte selector 0x7060302; gfx942 additionally expects an s_nop 0
; before it.
define void @v_shuffle_v2bf16_v8bf16__5_9(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v7, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[3:6]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v3, v2, s4
; GFX900-NEXT:    global_store_dword v7, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v4, v2, s4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__5_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v4, v2, s2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 5, i32 9>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <6, 9>: lane 0 reads element 6 of %vec0 and lane 1 reads
; element 1 of %vec1, so both inline asm defs appear in the expected output.
; On all three targets the second def is placed at v[4:7], so the two
; register tuples do not overlap. The halves are merged with v_bfi_b32 under
; mask 0xffff on v3 and v4; gfx942 additionally expects an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__6_9(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v8, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_bfi_b32 v0, s4, v3, v4
; GFX900-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v3, v4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__6_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v3, v4
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 6, i32 9>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <7, 9>: lane 0 is element 7 of %vec0, lane 1 is element 1 of %vec1.
; Both selected elements sit in the upper 16 bits of their register, so the
; checks expect one v_perm_b32 with byte selector 0x7060302 that packs the
; high half of v3 (low result half) with the high half of v4.
define void @v_shuffle_v2bf16_v8bf16__7_9(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v8, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v4, v3, s4
; GFX900-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v4, v3, s4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__7_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v4, v3, s2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 7, i32 9>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <8, 9>: elements 0 and 1 of %vec1, in their original order.
; %vec0 is not referenced by the mask; the checks show a single asm def and
; no shuffle instruction at all - the first register of that def (v0) is
; stored directly.
define void @v_shuffle_v2bf16_v8bf16__8_9(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__8_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 8, i32 9>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <9, 9>: element 1 of %vec1 replicated into both result lanes.
; The checks show a single asm def (only %vec1 is used by the mask) and one
; v_perm_b32 with selector 0x7060302 that reads v0 for both sources, i.e. the
; high 16 bits of v0 end up in both halves of the stored dword.
define void @v_shuffle_v2bf16_v8bf16__9_9(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v0, v0, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__9_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v0, v0, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 9>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <10, 9>: elements 2 and 1 of %vec1 (both lanes from the second operand).
; The checks show a single asm def and one v_bfi_b32 with a 0xffff select
; mask: low 16 bits taken from v1, high 16 bits kept from v0.
define void @v_shuffle_v2bf16_v8bf16__10_9(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_bfi_b32 v0, s4, v1, v0
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v1, v0
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__10_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v1, v0
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 9>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <11, 9>: elements 3 and 1 of %vec1 (both lanes from the second operand).
; The checks show a single asm def and one v_perm_b32 with selector
; 0x7060302: high half of v1 becomes the low result half, high half of v0
; stays in the high result half.
define void @v_shuffle_v2bf16_v8bf16__11_9(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v0, v1, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v0, v1, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__11_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v0, v1, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 9>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <12, 9>: elements 4 and 1 of %vec1 (both lanes from the second operand).
; The checks show a single asm def and one v_bfi_b32 with a 0xffff select
; mask: low 16 bits taken from v2, high 16 bits kept from v0.
define void @v_shuffle_v2bf16_v8bf16__12_9(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_bfi_b32 v0, s4, v2, v0
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v2, v0
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__12_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v2, v0
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 9>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <13, 9>: elements 5 and 1 of %vec1 (both lanes from the second operand).
; The checks show a single asm def and one v_perm_b32 with selector
; 0x7060302: high half of v2 becomes the low result half, high half of v0
; stays in the high result half.
define void @v_shuffle_v2bf16_v8bf16__13_9(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v0, v2, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v0, v2, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__13_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v0, v2, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 9>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <14, 9>: elements 6 and 1 of %vec1 (both lanes from the second operand).
; The checks show a single asm def and one v_bfi_b32 with a 0xffff select
; mask: low 16 bits taken from v3, high 16 bits kept from v0.
define void @v_shuffle_v2bf16_v8bf16__14_9(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_bfi_b32 v0, s4, v3, v0
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v3, v0
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__14_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v3, v0
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 9>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <poison, 10>: lane 0 is unspecified, lane 1 is element 2 of %vec1.
; The checks show a single asm def and a v_lshlrev_b32 by 16 that moves the
; low 16 bits of v1 into the high half of the stored dword; the low half is
; whatever the shift leaves there (zero), which is acceptable for a poison lane.
define void @v_shuffle_v2bf16_v8bf16__u_10(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__u_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 poison, i32 10>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <0, 10>: lane 0 is element 0 of %vec0, lane 1 is element 2 of %vec1.
; Both selected elements sit in the lower 16 bits of their register, so the
; checks expect one v_perm_b32 with byte selector 0x5040100 packing two low
; halves. The second asm def is placed at v[1:4] on gfx900 but at v[2:5] on
; gfx90a/gfx942, so the %vec1 source register is v2 vs. v3 respectively
; (presumably a register-tuple alignment difference between the targets -
; confirm against the backend).
define void @v_shuffle_v2bf16_v8bf16__0_10(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v5, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:4]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v2, v0, s4
; GFX900-NEXT:    global_store_dword v5, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v3, v0, s4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__0_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v3, v0, s2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 0, i32 10>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <1, 10>: lane 0 is element 1 of %vec0, lane 1 is element 2 of %vec1.
; A high half followed by a low half is expected to fold into a single
; v_alignbit_b32 by 16 (no selector constant needed): high 16 bits of v0
; land in the low result half, low 16 bits of the %vec1 register in the high
; half. The second asm def is v[1:4] on gfx900 and v[2:5] on gfx90a/gfx942,
; so that register is v2 vs. v3.
define void @v_shuffle_v2bf16_v8bf16__1_10(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v5, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:4]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v0, v2, v0, 16
; GFX900-NEXT:    global_store_dword v5, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v0, v3, v0, 16
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__1_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_alignbit_b32 v0, v3, v0, 16
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 1, i32 10>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <2, 10>: lane 0 is element 2 of %vec0, lane 1 is element 2 of %vec1.
; Both are low halves, so the checks expect one v_perm_b32 with selector
; 0x5040100 packing the low 16 bits of v1 and v3. Here all three targets
; place the second asm def at v[2:5].
define void @v_shuffle_v2bf16_v8bf16__2_10(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v6, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:5]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v3, v1, s4
; GFX900-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v3, v1, s4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__2_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v3, v1, s2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 2, i32 10>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <3, 10>: lane 0 is element 3 of %vec0, lane 1 is element 2 of %vec1.
; High half followed by low half: the checks expect a single v_alignbit_b32
; by 16 over v3 (from %vec1, def v[2:5]) and v1 (from %vec0), identical
; register choices on all three targets.
define void @v_shuffle_v2bf16_v8bf16__3_10(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v6, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:5]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v0, v3, v1, 16
; GFX900-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v0, v3, v1, 16
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__3_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_alignbit_b32 v0, v3, v1, 16
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 3, i32 10>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <4, 10>: lane 0 is element 4 of %vec0, lane 1 is element 2 of %vec1.
; Both are low halves, so the checks expect one v_perm_b32 with selector
; 0x5040100 packing the low 16 bits of v2 with those of the %vec1 register.
; The second asm def is v[3:6] on gfx900 but v[4:7] on gfx90a/gfx942, so
; that register is v4 vs. v5.
define void @v_shuffle_v2bf16_v8bf16__4_10(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v7, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[3:6]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v4, v2, s4
; GFX900-NEXT:    global_store_dword v7, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v5, v2, s4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__4_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v5, v2, s2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 4, i32 10>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <5, 10>: lane 0 is element 5 of %vec0, lane 1 is element 2 of %vec1.
; High half followed by low half: the checks expect a single v_alignbit_b32
; by 16 over the %vec1 register and v2. The second asm def is v[3:6] on
; gfx900 but v[4:7] on gfx90a/gfx942, so the %vec1 register is v4 vs. v5.
define void @v_shuffle_v2bf16_v8bf16__5_10(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v7, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[3:6]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v0, v4, v2, 16
; GFX900-NEXT:    global_store_dword v7, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v0, v5, v2, 16
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__5_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_alignbit_b32 v0, v5, v2, 16
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 5, i32 10>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <6, 10>: lane 0 is element 6 of %vec0, lane 1 is element 2 of %vec1.
; Both are low halves, so the checks expect one v_perm_b32 with selector
; 0x5040100 packing the low 16 bits of v3 and v5. All three targets place
; the second asm def at v[4:7].
define void @v_shuffle_v2bf16_v8bf16__6_10(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v8, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v5, v3, s4
; GFX900-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v5, v3, s4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__6_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v5, v3, s2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 6, i32 10>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <7, 10>: lane 0 is element 7 of %vec0, lane 1 is element 2 of %vec1.
; High half followed by low half: the checks expect a single v_alignbit_b32
; by 16 over v5 (from %vec1, def v[4:7]) and v3 (from %vec0), identical
; register choices on all three targets.
define void @v_shuffle_v2bf16_v8bf16__7_10(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v8, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v0, v5, v3, 16
; GFX900-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v0, v5, v3, 16
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__7_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_alignbit_b32 v0, v5, v3, 16
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 7, i32 10>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <8, 10>: both result lanes come from %vec1 (elements 0 and 2);
; %vec0 is not referenced by the mask, and the checked output contains only
; one asm def. The two lanes are combined with v_perm_b32 using the
; selector constant 0x5040100.
define void @v_shuffle_v2bf16_v8bf16__8_10(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v1, v0, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v1, v0, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__8_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v1, v0, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 8, i32 10>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <9, 10>: both result lanes come from %vec1 (elements 1 and 2);
; %vec0 is not referenced by the mask, and the checked output contains only
; one asm def. The lanes are combined with v_alignbit_b32 ... 16.
define void @v_shuffle_v2bf16_v8bf16__9_10(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v1, v0, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v1, v0, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__9_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v1, v0, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 10>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <10, 10>: element 2 of %vec1 is duplicated into both result
; lanes; %vec0 is not referenced by the mask, and the checked output
; contains only one asm def. The checked output uses v_perm_b32 with the
; same register for both sources and selector 0x5040100.
define void @v_shuffle_v2bf16_v8bf16__10_10(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v1, v1, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v1, v1, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__10_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v1, v1, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 10>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <11, 10>: result lanes are elements 3 and 2 of %vec1, in that
; order; %vec0 is not referenced by the mask, and the checked output
; contains only one asm def. The checked output uses v_alignbit_b32 with
; the same register for both sources and a shift of 16.
define void @v_shuffle_v2bf16_v8bf16__11_10(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v1, v1, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v1, v1, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__11_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v1, v1, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 10>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <12, 10>: result lanes are elements 4 and 2 of %vec1;
; %vec0 is not referenced by the mask, and the checked output contains only
; one asm def. The lanes are combined with v_perm_b32 using the selector
; constant 0x5040100.
define void @v_shuffle_v2bf16_v8bf16__12_10(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v1, v2, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v1, v2, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__12_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v1, v2, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 10>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <13, 10>: result lanes are elements 5 and 2 of %vec1;
; %vec0 is not referenced by the mask, and the checked output contains only
; one asm def. The lanes are combined with v_alignbit_b32 ... 16.
define void @v_shuffle_v2bf16_v8bf16__13_10(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v1, v2, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v1, v2, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__13_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v1, v2, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 10>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <14, 10>: result lanes are elements 6 and 2 of %vec1;
; %vec0 is not referenced by the mask, and the checked output contains only
; one asm def. The lanes are combined with v_perm_b32 using the selector
; constant 0x5040100.
define void @v_shuffle_v2bf16_v8bf16__14_10(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v1, v3, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v1, v3, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__14_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v1, v3, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 10>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <poison, 11>: result lane 0 is poison and lane 1 is element 3
; of %vec1; %vec0 is not referenced by the mask. The checked output has no
; combining instruction: one register of the single asm def is stored
; directly.
define void @v_shuffle_v2bf16_v8bf16__u_11(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v1, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v1, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__u_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v1, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 poison, i32 11>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <0, 11>: result lane 0 is element 0 of %vec0 and lane 1 is
; element 3 of %vec1 (mask indices 8-15 address the second operand).
; The checked output merges the two sources with v_bfi_b32 under a 0xffff
; mask. The register tuple expected for the second asm def differs between
; the gfx900 checks (v[1:4]) and the gfx90a/gfx942 checks (v[2:5]); the
; gfx942 checks also expect an s_nop 0 before the v_bfi_b32.
define void @v_shuffle_v2bf16_v8bf16__0_11(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v5, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:4]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_bfi_b32 v0, s4, v0, v2
; GFX900-NEXT:    global_store_dword v5, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v0, v3
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__0_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v0, v3
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 0, i32 11>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <1, 11>: result lane 0 is element 1 of %vec0 and lane 1 is
; element 3 of %vec1 (mask indices 8-15 address the second operand).
; The checked output merges the two sources with v_perm_b32 using the
; selector constant 0x7060302. The register tuple expected for the second
; asm def differs between the gfx900 checks (v[1:4]) and the gfx90a/gfx942
; checks (v[2:5]); the gfx942 checks also expect an s_nop 0 before the
; v_perm_b32.
define void @v_shuffle_v2bf16_v8bf16__1_11(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v5, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:4]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v2, v0, s4
; GFX900-NEXT:    global_store_dword v5, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v3, v0, s4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__1_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v3, v0, s2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 1, i32 11>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <2, 11>: result lane 0 is element 2 of %vec0 and lane 1 is
; element 3 of %vec1 (mask indices 8-15 address the second operand).
; The checked output merges the two sources with v_bfi_b32 under a 0xffff
; mask; the gfx942 checks also expect an s_nop 0 before the v_bfi_b32.
define void @v_shuffle_v2bf16_v8bf16__2_11(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v6, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:5]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_bfi_b32 v0, s4, v1, v3
; GFX900-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v1, v3
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__2_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v1, v3
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 2, i32 11>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <3, 11>: result lane 0 is element 3 of %vec0 and lane 1 is
; element 3 of %vec1 (mask indices 8-15 address the second operand).
; The checked output merges the two sources with v_perm_b32 using the
; selector constant 0x7060302; the gfx942 checks also expect an s_nop 0
; before the v_perm_b32.
define void @v_shuffle_v2bf16_v8bf16__3_11(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v6, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:5]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v3, v1, s4
; GFX900-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v3, v1, s4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__3_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v3, v1, s2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 3, i32 11>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <4, 11>: result lane 0 is element 4 of %vec0 and lane 1 is
; element 3 of %vec1 (mask indices 8-15 address the second operand).
; The checked output merges the two sources with v_bfi_b32 under a 0xffff
; mask. The register tuple expected for the second asm def differs between
; the gfx900 checks (v[3:6]) and the gfx90a/gfx942 checks (v[4:7]); the
; gfx942 checks also expect an s_nop 0 before the v_bfi_b32.
define void @v_shuffle_v2bf16_v8bf16__4_11(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v7, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[3:6]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_bfi_b32 v0, s4, v2, v4
; GFX900-NEXT:    global_store_dword v7, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v2, v5
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__4_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v2, v5
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 4, i32 11>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <5, 11>: result lane 0 is element 5 of %vec0 and lane 1 is
; element 3 of %vec1 (mask indices 8-15 address the second operand).
; The checked output merges the two sources with v_perm_b32 using the
; selector constant 0x7060302. The register tuple expected for the second
; asm def differs between the gfx900 checks (v[3:6]) and the gfx90a/gfx942
; checks (v[4:7]); the gfx942 checks also expect an s_nop 0 before the
; v_perm_b32.
define void @v_shuffle_v2bf16_v8bf16__5_11(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v7, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[3:6]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v4, v2, s4
; GFX900-NEXT:    global_store_dword v7, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v5, v2, s4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__5_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v5, v2, s2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 5, i32 11>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <6, 11>: result lane 0 is element 6 of %vec0 and lane 1 is
; element 3 of %vec1 (mask indices 8-15 address the second operand).
; The checked output merges the two sources with v_bfi_b32 under a 0xffff
; mask; the gfx942 checks also expect an s_nop 0 before the v_bfi_b32.
define void @v_shuffle_v2bf16_v8bf16__6_11(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v8, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_bfi_b32 v0, s4, v3, v5
; GFX900-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v3, v5
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__6_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v3, v5
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 6, i32 11>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <7, 11>: result lane 0 is element 7 of %vec0 and lane 1 is
; element 3 of %vec1 (mask indices 8-15 address the second operand).
; The checked output merges the two sources with v_perm_b32 using the
; selector constant 0x7060302; the gfx942 checks also expect an s_nop 0
; before the v_perm_b32.
define void @v_shuffle_v2bf16_v8bf16__7_11(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v8, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v5, v3, s4
; GFX900-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v5, v3, s4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__7_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v5, v3, s2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 7, i32 11>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <8, 11>: result lanes are elements 0 and 3 of %vec1;
; %vec0 is not referenced by the mask, and the checked output contains only
; one asm def. The lanes are combined with v_bfi_b32 under a 0xffff mask.
define void @v_shuffle_v2bf16_v8bf16__8_11(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_bfi_b32 v0, s4, v0, v1
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v0, v1
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__8_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v0, v1
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 8, i32 11>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <9, 11>: result lanes are elements 1 and 3 of %vec1;
; %vec0 is not referenced by the mask, and the checked output contains only
; one asm def. The lanes are combined with v_perm_b32 using the selector
; constant 0x7060302.
define void @v_shuffle_v2bf16_v8bf16__9_11(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v1, v0, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v1, v0, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__9_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v1, v0, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 11>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <10, 11> -- selects %vec1[2] and %vec1[3], an adjacent pair in
; their original order; %vec0 does not contribute. The expected output has a
; single "; def" and no permute instruction at all: register v1 of the def'd
; tuple is stored directly as one dword.
define void @v_shuffle_v2bf16_v8bf16__10_11(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v1, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v1, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__10_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v1, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 11>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <11, 11> -- both result lanes are the same element, %vec1[3];
; %vec0 does not contribute. The expected output has a single "; def" and one
; v_perm_b32 (selector 0x7060302) that takes v1 as both data operands,
; followed by a dword store.
define void @v_shuffle_v2bf16_v8bf16__11_11(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v1, v1, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v1, v1, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__11_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v1, v1, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 11>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <12, 11> -- selects %vec1[4] and %vec1[3]; %vec0 does not
; contribute. The expected output has a single "; def" and one v_bfi_b32 with
; a 0xffff mask register merging v2 and v1, followed by a dword store.
define void @v_shuffle_v2bf16_v8bf16__12_11(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_bfi_b32 v0, s4, v2, v1
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v2, v1
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__12_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v2, v1
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 11>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <13, 11> -- selects %vec1[5] and %vec1[3]; %vec0 does not
; contribute. The expected output has a single "; def" and one v_perm_b32
; (selector 0x7060302) over v1 and v2, followed by a dword store.
define void @v_shuffle_v2bf16_v8bf16__13_11(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v1, v2, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v1, v2, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__13_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v1, v2, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 11>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <14, 11> -- selects %vec1[6] and %vec1[3]; %vec0 does not
; contribute. The expected output has a single "; def" and one v_bfi_b32 with
; a 0xffff mask register merging v3 and v1, followed by a dword store.
define void @v_shuffle_v2bf16_v8bf16__14_11(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_bfi_b32 v0, s4, v3, v1
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v3, v1
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__14_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v3, v1
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 11>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <poison, 12> -- the first result lane is unconstrained and the
; second is %vec1[4]; %vec0 does not contribute. The expected output has a
; single "; def" and moves the element into place with a 16-bit left shift of
; v2 (v_lshlrev_b32), followed by a dword store.
define void @v_shuffle_v2bf16_v8bf16__u_12(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__u_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 poison, i32 12>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <0, 12> -- selects %vec0[0] and %vec1[4], so both inline-asm
; values are live and two "; def" tuples appear in the expected output. The
; result is built with one v_perm_b32 (selector 0x5040100). The second tuple is
; v[1:4] for gfx900 but v[2:5] for gfx90a/gfx942, and the gfx942 checks also
; expect an s_nop 0 ahead of the permute.
; NOTE(review): the differing tuple start presumably reflects a register
; alignment rule on the newer subtargets -- confirm.
define void @v_shuffle_v2bf16_v8bf16__0_12(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v5, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:4]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v3, v0, s4
; GFX900-NEXT:    global_store_dword v5, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v4, v0, s4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__0_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v4, v0, s2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 0, i32 12>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <1, 12> -- selects %vec0[1] and %vec1[4], so both inline-asm
; values are live and two "; def" tuples appear in the expected output. The
; result is built with one v_alignbit_b32 using a shift of 16; no selector
; constant is materialized. The second tuple is v[1:4] for gfx900 but v[2:5]
; for gfx90a/gfx942, and the gfx942 checks also expect an s_nop 0 ahead of the
; alignbit.
define void @v_shuffle_v2bf16_v8bf16__1_12(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v5, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:4]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v0, v3, v0, 16
; GFX900-NEXT:    global_store_dword v5, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v0, v4, v0, 16
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__1_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_alignbit_b32 v0, v4, v0, 16
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 1, i32 12>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <2, 12> -- selects %vec0[2] and %vec1[4], so both inline-asm
; values are live and two "; def" tuples appear in the expected output. The
; result is built with one v_perm_b32 (selector 0x5040100) over v4 and v1. The
; second tuple is v[2:5] on all three subtargets; the gfx942 checks
; additionally expect an s_nop 0 ahead of the permute.
define void @v_shuffle_v2bf16_v8bf16__2_12(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v6, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:5]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v4, v1, s4
; GFX900-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v4, v1, s4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__2_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v4, v1, s2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 2, i32 12>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <3, 12> -- selects %vec0[3] and %vec1[4], so both inline-asm
; values are live and two "; def" tuples appear in the expected output. The
; result is built with one v_alignbit_b32 (shift 16) over v4 and v1. The second
; tuple is v[2:5] on all three subtargets; the gfx942 checks additionally
; expect an s_nop 0 ahead of the alignbit.
define void @v_shuffle_v2bf16_v8bf16__3_12(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v6, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:5]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v0, v4, v1, 16
; GFX900-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v0, v4, v1, 16
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__3_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_alignbit_b32 v0, v4, v1, 16
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 3, i32 12>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <4, 12> -- selects %vec0[4] and %vec1[4], so both inline-asm
; values are live and two "; def" tuples appear in the expected output. The
; result is built with one v_perm_b32 (selector 0x5040100) whose second data
; operand is v2. The second tuple is v[3:6] for gfx900 but v[4:7] for
; gfx90a/gfx942, and the gfx942 checks also expect an s_nop 0 ahead of the
; permute.
define void @v_shuffle_v2bf16_v8bf16__4_12(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v7, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[3:6]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v5, v2, s4
; GFX900-NEXT:    global_store_dword v7, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v6, v2, s4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__4_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v6, v2, s2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 4, i32 12>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <5, 12> -- selects %vec0[5] and %vec1[4], so both inline-asm
; values are live and two "; def" tuples appear in the expected output. The
; result is built with one v_alignbit_b32 (shift 16) whose second data operand
; is v2. The second tuple is v[3:6] for gfx900 but v[4:7] for gfx90a/gfx942,
; and the gfx942 checks also expect an s_nop 0 ahead of the alignbit.
define void @v_shuffle_v2bf16_v8bf16__5_12(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v7, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[3:6]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v0, v5, v2, 16
; GFX900-NEXT:    global_store_dword v7, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v0, v6, v2, 16
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__5_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_alignbit_b32 v0, v6, v2, 16
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 5, i32 12>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <6, 12> -- selects %vec0[6] and %vec1[4], so both inline-asm
; values are live and two "; def" tuples (v[0:3] and v[4:7]) appear in the
; expected output on every subtarget. The result is built with one v_perm_b32
; (selector 0x5040100) over v6 and v3; the gfx942 checks additionally expect
; an s_nop 0 ahead of the permute.
define void @v_shuffle_v2bf16_v8bf16__6_12(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v8, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v6, v3, s4
; GFX900-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v6, v3, s4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__6_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v6, v3, s2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 6, i32 12>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <7, 12> -- selects %vec0[7] and %vec1[4], so both inline-asm
; values are live and two "; def" tuples (v[0:3] and v[4:7]) appear in the
; expected output on every subtarget. The result is built with one
; v_alignbit_b32 (shift 16) over v6 and v3; the gfx942 checks additionally
; expect an s_nop 0 ahead of the alignbit.
define void @v_shuffle_v2bf16_v8bf16__7_12(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v8, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v0, v6, v3, 16
; GFX900-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v0, v6, v3, 16
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__7_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_alignbit_b32 v0, v6, v3, 16
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 7, i32 12>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <8, 12> -- both lanes index past the 8 elements of the first
; operand, selecting %vec1[0] and %vec1[4]; %vec0 does not contribute. The
; expected output has a single "; def" and one v_perm_b32 (selector 0x5040100)
; over v2 and v0, followed by a dword store.
define void @v_shuffle_v2bf16_v8bf16__8_12(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v2, v0, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v2, v0, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__8_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v2, v0, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 8, i32 12>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <9, 12> -- selects %vec1[1] and %vec1[4]; %vec0 does not
; contribute. The expected output has a single "; def" and one v_alignbit_b32
; (shift 16) over v2 and v0, followed by a dword store; no selector constant
; is materialized.
define void @v_shuffle_v2bf16_v8bf16__9_12(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v2, v0, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v2, v0, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__9_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v2, v0, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 12>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <10, 12> -- selects %vec1[2] and %vec1[4]; %vec0 does not
; contribute. The expected output has a single "; def" and one v_perm_b32
; (selector 0x5040100) over v2 and v1, followed by a dword store.
define void @v_shuffle_v2bf16_v8bf16__10_12(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v2, v1, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v2, v1, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__10_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v2, v1, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 12>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <11, 12>: indices >= 8 select from %vec1, so the result is
; <%vec1[3], %vec1[4]> and %vec0 does not contribute. The checks expect a
; single asm def and one v_alignbit_b32 of v2:v1 by 16 bits.
define void @v_shuffle_v2bf16_v8bf16__11_12(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v2, v1, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v2, v1, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__11_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v2, v1, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 12>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <12, 12>: indices >= 8 select from %vec1, so the result is a splat of
; %vec1[4] and %vec0 does not contribute. The checks expect a single asm def
; and one v_perm_b32 (selector 0x5040100) with v2 as both sources.
define void @v_shuffle_v2bf16_v8bf16__12_12(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v2, v2, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v2, v2, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__12_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v2, v2, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 12>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <13, 12>: indices >= 8 select from %vec1, so the result is
; <%vec1[5], %vec1[4]> (the two halves of one dword swapped) and %vec0 does not
; contribute. The checks expect a single asm def and one v_alignbit_b32 of
; v2:v2 by 16 bits.
define void @v_shuffle_v2bf16_v8bf16__13_12(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v2, v2, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v2, v2, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__13_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v2, v2, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 12>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <14, 12>: indices >= 8 select from %vec1, so the result is
; <%vec1[6], %vec1[4]> and %vec0 does not contribute. The checks expect a
; single asm def and one v_perm_b32 (selector 0x5040100) combining v3 and v2.
define void @v_shuffle_v2bf16_v8bf16__14_12(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v2, v3, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v2, v3, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__14_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v2, v3, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 12>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <poison, 13>: lane 0 is unconstrained and lane 1 is %vec1[5] (index 13
; selects from the second operand). The checks expect a single asm def and a
; direct store of v2, with no permute instruction.
define void @v_shuffle_v2bf16_v8bf16__u_13(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v2, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v2, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__u_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v2, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 poison, i32 13>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <0, 13>: lane 0 is %vec0[0] and lane 1 is %vec1[5] (index 13 selects
; from the second operand), so both asm defs appear in the checks. The two
; halves are merged with v_bfi_b32 using a 0xffff mask held in an SGPR; the
; gfx942 checks additionally expect an s_nop 0 before the merge.
define void @v_shuffle_v2bf16_v8bf16__0_13(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v5, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:4]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_bfi_b32 v0, s4, v0, v3
; GFX900-NEXT:    global_store_dword v5, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v0, v4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__0_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v0, v4
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 0, i32 13>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <1, 13>: lane 0 is %vec0[1] and lane 1 is %vec1[5] (index 13 selects
; from the second operand), so both asm defs appear in the checks. The two
; halves are combined with one v_perm_b32 (selector 0x7060302); the gfx942
; checks additionally expect an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__1_13(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v5, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:4]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v3, v0, s4
; GFX900-NEXT:    global_store_dword v5, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v4, v0, s4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__1_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v4, v0, s2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 1, i32 13>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <2, 13>: lane 0 is %vec0[2] and lane 1 is %vec1[5] (index 13 selects
; from the second operand), so both asm defs appear in the checks. The two
; halves are merged with v_bfi_b32 using a 0xffff mask held in an SGPR; the
; gfx942 checks additionally expect an s_nop 0 before the merge.
define void @v_shuffle_v2bf16_v8bf16__2_13(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v6, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:5]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_bfi_b32 v0, s4, v1, v4
; GFX900-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v1, v4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__2_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v1, v4
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 2, i32 13>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <3, 13>: lane 0 is %vec0[3] and lane 1 is %vec1[5] (index 13 selects
; from the second operand), so both asm defs appear in the checks. The two
; halves are combined with one v_perm_b32 (selector 0x7060302); the gfx942
; checks additionally expect an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__3_13(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v6, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:5]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v4, v1, s4
; GFX900-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v4, v1, s4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__3_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v4, v1, s2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 3, i32 13>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <4, 13>: lane 0 is %vec0[4] and lane 1 is %vec1[5] (index 13 selects
; from the second operand), so both asm defs appear in the checks. The two
; halves are merged with v_bfi_b32 using a 0xffff mask held in an SGPR; the
; gfx942 checks additionally expect an s_nop 0 before the merge.
define void @v_shuffle_v2bf16_v8bf16__4_13(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v7, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[3:6]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_bfi_b32 v0, s4, v2, v5
; GFX900-NEXT:    global_store_dword v7, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v2, v6
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__4_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v2, v6
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 4, i32 13>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <5, 13>: lane 0 is %vec0[5] and lane 1 is %vec1[5] (index 13 selects
; from the second operand), so both asm defs appear in the checks. The two
; halves are combined with one v_perm_b32 (selector 0x7060302); the gfx942
; checks additionally expect an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__5_13(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v7, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[3:6]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v5, v2, s4
; GFX900-NEXT:    global_store_dword v7, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v6, v2, s4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__5_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v6, v2, s2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 5, i32 13>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <6, 13>: lane 0 is %vec0[6] and lane 1 is %vec1[5] (index 13 selects
; from the second operand), so both asm defs appear in the checks. The two
; halves are merged with v_bfi_b32 using a 0xffff mask held in an SGPR; the
; gfx942 checks additionally expect an s_nop 0 before the merge.
define void @v_shuffle_v2bf16_v8bf16__6_13(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v8, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_bfi_b32 v0, s4, v3, v6
; GFX900-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v3, v6
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__6_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v3, v6
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 6, i32 13>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <7, 13>: lane 0 is %vec0[7] and lane 1 is %vec1[5] (index 13 selects
; from the second operand), so both asm defs appear in the checks. The two
; halves are combined with one v_perm_b32 (selector 0x7060302); the gfx942
; checks additionally expect an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__7_13(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v8, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v6, v3, s4
; GFX900-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v6, v3, s4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__7_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v6, v3, s2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 7, i32 13>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <8, 13>: indices >= 8 select from %vec1, so the result is
; <%vec1[0], %vec1[5]> and %vec0 does not contribute. The checks expect a
; single asm def and one v_bfi_b32 with a 0xffff mask merging v0 and v2.
define void @v_shuffle_v2bf16_v8bf16__8_13(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_bfi_b32 v0, s4, v0, v2
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v0, v2
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__8_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v0, v2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 8, i32 13>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <9, 13>: indices >= 8 select from %vec1, so the result is
; <%vec1[1], %vec1[5]> and %vec0 does not contribute. The checks expect a
; single asm def and one v_perm_b32 (selector 0x7060302) combining v0 and v2.
define void @v_shuffle_v2bf16_v8bf16__9_13(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v2, v0, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v2, v0, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__9_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v2, v0, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 13>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <10, 13>: indices >= 8 select from %vec1, so the result is
; <%vec1[2], %vec1[5]> and %vec0 does not contribute. The checks expect a
; single asm def and one v_bfi_b32 with a 0xffff mask merging v1 and v2.
define void @v_shuffle_v2bf16_v8bf16__10_13(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_bfi_b32 v0, s4, v1, v2
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v1, v2
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__10_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v1, v2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 13>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <11, 13>: indices >= 8 select from %vec1, so the result is
; <%vec1[3], %vec1[5]> and %vec0 does not contribute. The checks expect a
; single asm def and one v_perm_b32 (selector 0x7060302) combining v1 and v2.
define void @v_shuffle_v2bf16_v8bf16__11_13(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v2, v1, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v2, v1, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__11_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v2, v1, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 13>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <12, 13>: mask indices 8-15 address the second operand, so the
; result is elements 4 and 5 of %vec1, in their original order. %vec0 is never
; referenced, and the expected output contains only one inline-asm def.
; No combining instruction is expected: every target stores v2 of that def
; directly.
define void @v_shuffle_v2bf16_v8bf16__12_13(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v2, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v2, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__12_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v2, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 13>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <13, 13>: mask indices 8-15 address the second operand, so both
; result lanes are element 5 of %vec1. %vec0 is never referenced, and the
; expected output contains only one inline-asm def.
; All three targets are expected to duplicate the element with a single
; v_perm_b32 (selector 0x7060302) that uses v2 for both sources.
define void @v_shuffle_v2bf16_v8bf16__13_13(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v2, v2, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v2, v2, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__13_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v2, v2, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 13>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <14, 13>: mask indices 8-15 address the second operand, so
; lane 0 is element 6 of %vec1 and lane 1 is element 5 of %vec1. %vec0 is never
; referenced, and the expected output contains only one inline-asm def.
; All three targets are expected to merge the two halves with a single
; v_bfi_b32 using the mask 0xffff on v3 and v2.
define void @v_shuffle_v2bf16_v8bf16__14_13(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_bfi_b32 v0, s4, v3, v2
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v3, v2
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__14_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v3, v2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 13>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <poison, 14>: lane 0 is unspecified and lane 1 is element 6 of
; %vec1 (mask indices 8-15 address the second operand). %vec0 is never
; referenced, and the expected output contains only one inline-asm def.
; All three targets are expected to place the element with a single
; v_lshlrev_b32 of v3 by 16; no second source register is involved.
define void @v_shuffle_v2bf16_v8bf16__u_14(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__u_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 poison, i32 14>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <0, 14>: lane 0 is element 0 of %vec0 and lane 1 is element 6 of
; %vec1 (mask indices 8-15 address the second operand). Both inline-asm values
; are used, so the expected output contains two defs.
; The second def's register tuple differs per target: v[1:4] on gfx900 versus
; v[2:5] on gfx90a and gfx942. The combine is expected to be a single
; v_perm_b32 with selector 0x5040100. The gfx942 checks additionally expect an
; s_nop 0 between the second def and the permute.
define void @v_shuffle_v2bf16_v8bf16__0_14(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v5, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:4]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v4, v0, s4
; GFX900-NEXT:    global_store_dword v5, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v5, v0, s4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__0_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v5, v0, s2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 0, i32 14>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <1, 14>: lane 0 is element 1 of %vec0 and lane 1 is element 6 of
; %vec1 (mask indices 8-15 address the second operand). Both inline-asm values
; are used, so the expected output contains two defs.
; The second def's register tuple differs per target: v[1:4] on gfx900 versus
; v[2:5] on gfx90a and gfx942. The combine is expected to be a single
; v_alignbit_b32 with a shift of 16, so no selector constant is materialised.
; The gfx942 checks additionally expect an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__1_14(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v5, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:4]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v0, v4, v0, 16
; GFX900-NEXT:    global_store_dword v5, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v0, v5, v0, 16
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__1_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_alignbit_b32 v0, v5, v0, 16
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 1, i32 14>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <2, 14>: lane 0 is element 2 of %vec0 and lane 1 is element 6 of
; %vec1 (mask indices 8-15 address the second operand). Both inline-asm values
; are used, so the expected output contains two defs.
; Here the second def is expected in v[2:5] on all three targets. The combine
; is expected to be a single v_perm_b32 with selector 0x5040100 reading v5 and
; v1. The gfx942 checks additionally expect an s_nop 0 before the permute.
define void @v_shuffle_v2bf16_v8bf16__2_14(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v6, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:5]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v5, v1, s4
; GFX900-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v5, v1, s4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__2_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v5, v1, s2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 2, i32 14>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <3, 14>: lane 0 is element 3 of %vec0 and lane 1 is element 6 of
; %vec1 (mask indices 8-15 address the second operand). Both inline-asm values
; are used, so the expected output contains two defs.
; The second def is expected in v[2:5] on all three targets. The combine is
; expected to be a single v_alignbit_b32 of v5 and v1 with a shift of 16.
; The gfx942 checks additionally expect an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__3_14(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v6, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:5]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v0, v5, v1, 16
; GFX900-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v0, v5, v1, 16
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__3_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_alignbit_b32 v0, v5, v1, 16
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 3, i32 14>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <4, 14>: lane 0 is element 4 of %vec0 and lane 1 is element 6 of
; %vec1 (mask indices 8-15 address the second operand). Both inline-asm values
; are used, so the expected output contains two defs.
; The second def's register tuple differs per target: v[3:6] on gfx900 versus
; v[4:7] on gfx90a and gfx942. The combine is expected to be a single
; v_perm_b32 with selector 0x5040100 whose second source is v2. The gfx942
; checks additionally expect an s_nop 0 before the permute.
define void @v_shuffle_v2bf16_v8bf16__4_14(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v7, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[3:6]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v6, v2, s4
; GFX900-NEXT:    global_store_dword v7, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v7, v2, s4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__4_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v7, v2, s2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 4, i32 14>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <5, 14>: lane 0 is element 5 of %vec0 and lane 1 is element 6 of
; %vec1 (mask indices 8-15 address the second operand). Both inline-asm values
; are used, so the expected output contains two defs.
; The second def's register tuple differs per target: v[3:6] on gfx900 versus
; v[4:7] on gfx90a and gfx942. The combine is expected to be a single
; v_alignbit_b32 with a shift of 16 whose second source is v2. The gfx942
; checks additionally expect an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__5_14(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v7, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[3:6]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v0, v6, v2, 16
; GFX900-NEXT:    global_store_dword v7, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v0, v7, v2, 16
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__5_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_alignbit_b32 v0, v7, v2, 16
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 5, i32 14>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <6, 14>: lane 0 is element 6 of %vec0 and lane 1 is element 6 of
; %vec1 (mask indices 8-15 address the second operand). Both inline-asm values
; are used, so the expected output contains two defs.
; The second def is expected in v[4:7] on all three targets. The combine is
; expected to be a single v_perm_b32 with selector 0x5040100 reading v7 and v3.
; The gfx942 checks additionally expect an s_nop 0 before the permute.
define void @v_shuffle_v2bf16_v8bf16__6_14(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v8, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v7, v3, s4
; GFX900-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v7, v3, s4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__6_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v7, v3, s2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 6, i32 14>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <7, 14>: lane 0 is element 7 of %vec0 and lane 1 is element 6 of
; %vec1 (mask indices 8-15 address the second operand). Both inline-asm values
; are used, so the expected output contains two defs.
; The second def is expected in v[4:7] on all three targets. The combine is
; expected to be a single v_alignbit_b32 of v7 and v3 with a shift of 16.
; The gfx942 checks additionally expect an s_nop 0 before it.
define void @v_shuffle_v2bf16_v8bf16__7_14(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v8, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_alignbit_b32 v0, v7, v3, 16
; GFX900-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_alignbit_b32 v0, v7, v3, 16
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__7_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_alignbit_b32 v0, v7, v3, 16
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 7, i32 14>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <8, 14>: mask indices 8-15 address the second operand, so the
; result is elements 0 and 6 of %vec1. %vec0 is never referenced, and the
; expected output contains only one inline-asm def.
; All three targets are expected to combine the two elements with a single
; v_perm_b32 (selector 0x5040100) reading v3 and v0 of that def.
define void @v_shuffle_v2bf16_v8bf16__8_14(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v3, v0, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v3, v0, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__8_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v3, v0, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 8, i32 14>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <9, 14>: mask indices 8-15 address the second operand, so the
; result is elements 1 and 6 of %vec1. %vec0 is never referenced, and the
; expected output contains only one inline-asm def.
; All three targets are expected to combine the two elements with a single
; v_alignbit_b32 of v3 and v0 with a shift of 16.
define void @v_shuffle_v2bf16_v8bf16__9_14(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v3, v0, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v3, v0, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__9_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v3, v0, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 14>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <10, 14>: mask indices 8-15 address the second operand, so the
; result is elements 2 and 6 of %vec1. %vec0 is never referenced, and the
; expected output contains only one inline-asm def.
; All three targets are expected to combine the two elements with a single
; v_perm_b32 (selector 0x5040100) reading v3 and v1 of that def.
define void @v_shuffle_v2bf16_v8bf16__10_14(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v3, v1, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v3, v1, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__10_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v3, v1, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 14>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <11, 14>: mask indices 8-15 address the second operand, so the
; result is elements 3 and 6 of %vec1. %vec0 is never referenced, and the
; expected output contains only one inline-asm def.
; All three targets are expected to combine the two elements with a single
; v_alignbit_b32 of v3 and v1 with a shift of 16.
define void @v_shuffle_v2bf16_v8bf16__11_14(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v3, v1, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v3, v1, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__11_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v3, v1, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 14>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <12, 14>: mask indices 8-15 address the second operand, so the
; result is elements 4 and 6 of %vec1. %vec0 is never referenced, and the
; expected output contains only one inline-asm def.
; All three targets are expected to combine the two elements with a single
; v_perm_b32 (selector 0x5040100) reading v3 and v2 of that def.
define void @v_shuffle_v2bf16_v8bf16__12_14(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v3, v2, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v3, v2, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__12_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v3, v2, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 14>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Shuffle mask <13, 14>: mask indices 8-15 address the second operand, so the
; result is elements 5 and 6 of %vec1. %vec0 is never referenced, and the
; expected output contains only one inline-asm def.
; All three targets are expected to combine the two elements with a single
; v_alignbit_b32 of v3 and v2 with a shift of 16.
define void @v_shuffle_v2bf16_v8bf16__13_14(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_alignbit_b32 v0, v3, v2, 16
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_alignbit_b32 v0, v3, v2, 16
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__13_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_alignbit_b32 v0, v3, v2, 16
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 14>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <14, 14> broadcasts element 6 of %vec1 into both result lanes. %vec0 is
; unused, so the checks contain a single inline-asm def. The expected lowering
; is v_perm_b32 with selector 0x5040100 and v3 supplied as both data operands.
define void @v_shuffle_v2bf16_v8bf16__14_14(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v3, v3, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x5040100
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v3, v3, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__14_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v3, v3, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 14>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <poison, 15> only constrains lane 1, which takes the last element of
; %vec1. The checks expect the last dword of the defined tuple (v3) to be
; stored as is, with no permute or merge instruction and a single def.
define void @v_shuffle_v2bf16_v8bf16__u_15(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__u_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v3, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__u_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v3, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__u_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v3, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 poison, i32 15>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <0, 15> mixes both inputs (element 0 of %vec0 and element 7 of %vec1),
; so the checks contain both inline-asm defs. The low half of v0 and the high
; half of the last dword of the second tuple are merged with v_bfi_b32 under
; mask 0xffff. The gfx942 checks additionally contain an s_nop 0 before the
; merge. The reason for it is not visible in this test.
define void @v_shuffle_v2bf16_v8bf16__0_15(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__0_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v5, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:4]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_bfi_b32 v0, s4, v0, v4
; GFX900-NEXT:    global_store_dword v5, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__0_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v0, v5
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__0_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v0, v5
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 0, i32 15>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <1, 15> mixes both inputs (element 1 of %vec0 and element 7 of %vec1),
; so the checks contain both inline-asm defs. Both selected elements sit in
; the high half of their dword, and the expected lowering is v_perm_b32 with
; selector 0x7060302. The gfx942 checks additionally contain an s_nop 0
; before the permute. The reason for it is not visible in this test.
define void @v_shuffle_v2bf16_v8bf16__1_15(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__1_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v5, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[1:4]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v4, v0, s4
; GFX900-NEXT:    global_store_dword v5, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__1_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v5, v0, s4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__1_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v5, v0, s2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 1, i32 15>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <2, 15> mixes both inputs (element 2 of %vec0 and element 7 of %vec1),
; so the checks contain both inline-asm defs. The low half of v1 and the high
; half of the last dword of the second tuple are merged with v_bfi_b32 under
; mask 0xffff. The gfx942 checks additionally contain an s_nop 0 before the
; merge. The reason for it is not visible in this test.
define void @v_shuffle_v2bf16_v8bf16__2_15(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__2_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v6, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:5]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_bfi_b32 v0, s4, v1, v5
; GFX900-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__2_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v1, v5
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__2_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v1, v5
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 2, i32 15>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <3, 15> mixes both inputs (element 3 of %vec0 and element 7 of %vec1),
; so the checks contain both inline-asm defs. Both selected elements sit in
; the high half of their dword, and the expected lowering is v_perm_b32 with
; selector 0x7060302. The gfx942 checks additionally contain an s_nop 0
; before the permute. The reason for it is not visible in this test.
define void @v_shuffle_v2bf16_v8bf16__3_15(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__3_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v6, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[2:5]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v5, v1, s4
; GFX900-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__3_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[2:5]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v5, v1, s4
; GFX90A-NEXT:    global_store_dword v6, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__3_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v6, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[2:5]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v5, v1, s2
; GFX942-NEXT:    global_store_dword v6, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 3, i32 15>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <4, 15> mixes both inputs (element 4 of %vec0 and element 7 of %vec1),
; so the checks contain both inline-asm defs. The low half of v2 and the high
; half of the last dword of the second tuple are merged with v_bfi_b32 under
; mask 0xffff. The gfx942 checks additionally contain an s_nop 0 before the
; merge. The reason for it is not visible in this test.
define void @v_shuffle_v2bf16_v8bf16__4_15(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__4_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v7, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[3:6]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_bfi_b32 v0, s4, v2, v6
; GFX900-NEXT:    global_store_dword v7, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__4_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v2, v7
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__4_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v2, v7
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 4, i32 15>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <5, 15> mixes both inputs (element 5 of %vec0 and element 7 of %vec1),
; so the checks contain both inline-asm defs. Both selected elements sit in
; the high half of their dword, and the expected lowering is v_perm_b32 with
; selector 0x7060302. The gfx942 checks additionally contain an s_nop 0
; before the permute. The reason for it is not visible in this test.
define void @v_shuffle_v2bf16_v8bf16__5_15(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__5_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v7, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[3:6]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v6, v2, s4
; GFX900-NEXT:    global_store_dword v7, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__5_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v7, v2, s4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__5_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v7, v2, s2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 5, i32 15>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <6, 15> mixes both inputs (element 6 of %vec0 and element 7 of %vec1),
; so the checks contain both inline-asm defs. The low half of v3 and the high
; half of the last dword of the second tuple are merged with v_bfi_b32 under
; mask 0xffff. The gfx942 checks additionally contain an s_nop 0 before the
; merge. The reason for it is not visible in this test.
define void @v_shuffle_v2bf16_v8bf16__6_15(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__6_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v8, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_bfi_b32 v0, s4, v3, v7
; GFX900-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__6_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v3, v7
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__6_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v3, v7
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 6, i32 15>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <7, 15> takes the last element of each input, so the checks contain
; both inline-asm defs. Both selected elements sit in the high half of the
; last dword of their tuple, and the expected lowering is v_perm_b32 with
; selector 0x7060302. The gfx942 checks additionally contain an s_nop 0
; before the permute. The reason for it is not visible in this test.
define void @v_shuffle_v2bf16_v8bf16__7_15(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__7_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v8, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    v_perm_b32 v0, v7, v3, s4
; GFX900-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__7_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    v_perm_b32 v0, v7, v3, s4
; GFX90A-NEXT:    global_store_dword v8, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__7_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v8, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_perm_b32 v0, v7, v3, s2
; GFX942-NEXT:    global_store_dword v8, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 7, i32 15>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <8, 15> reads only %vec1 (its first and last elements), so %vec0 is
; unused and the checks contain a single inline-asm def. The low half of v0
; and the high half of v3 are merged with v_bfi_b32 under mask 0xffff.
define void @v_shuffle_v2bf16_v8bf16__8_15(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__8_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_bfi_b32 v0, s4, v0, v3
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__8_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v0, v3
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__8_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v0, v3
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 8, i32 15>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <9, 15> reads only %vec1 (its elements 1 and 7), so %vec0 is unused and
; the checks contain a single inline-asm def. Both selected elements sit in
; the high half of their dword (v0 and v3), and the expected lowering is
; v_perm_b32 with selector 0x7060302.
define void @v_shuffle_v2bf16_v8bf16__9_15(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__9_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v3, v0, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__9_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v3, v0, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__9_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v3, v0, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 15>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <10, 15> reads only %vec1 (its elements 2 and 7), so %vec0 is unused
; and the checks contain a single inline-asm def. The low half of v1 and the
; high half of v3 are merged with v_bfi_b32 under mask 0xffff.
define void @v_shuffle_v2bf16_v8bf16__10_15(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__10_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_bfi_b32 v0, s4, v1, v3
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__10_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v1, v3
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__10_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v1, v3
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 15>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <11, 15> reads only %vec1 (its elements 3 and 7), so %vec0 is unused
; and the checks contain a single inline-asm def. Both selected elements sit
; in the high half of their dword (v1 and v3), and the expected lowering is
; v_perm_b32 with selector 0x7060302.
define void @v_shuffle_v2bf16_v8bf16__11_15(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__11_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v3, v1, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__11_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v3, v1, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__11_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v3, v1, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 15>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <12, 15> reads only %vec1 (its elements 4 and 7), so %vec0 is unused
; and the checks contain a single inline-asm def. The low half of v2 and the
; high half of v3 are merged with v_bfi_b32 under mask 0xffff.
define void @v_shuffle_v2bf16_v8bf16__12_15(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__12_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0xffff
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_bfi_b32 v0, s4, v2, v3
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__12_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_bfi_b32 v0, s4, v2, v3
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__12_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0xffff
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_bfi_b32 v0, s2, v2, v3
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 15>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <13, 15> reads only %vec1 (its elements 5 and 7), so %vec0 is unused
; and the checks contain a single inline-asm def. Both selected elements sit
; in the high half of their dword (v2 and v3), and the expected lowering is
; v_perm_b32 with selector 0x7060302.
define void @v_shuffle_v2bf16_v8bf16__13_15(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__13_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x7060302
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    v_perm_b32 v0, v3, v2, s4
; GFX900-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__13_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s4, 0x7060302
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    v_perm_b32 v0, v3, v2, s4
; GFX90A-NEXT:    global_store_dword v4, v0, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__13_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s2, 0x7060302
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    v_perm_b32 v0, v3, v2, s2
; GFX942-NEXT:    global_store_dword v4, v0, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 15>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Mask <14, 15> selects the last two elements of %vec1 in order, which is
; exactly the last dword of its tuple. %vec0 is unused, so the checks contain
; a single inline-asm def, and v3 is stored as is with no permute or merge.
define void @v_shuffle_v2bf16_v8bf16__14_15(ptr addrspace(1) inreg %ptr) {
; GFX900-LABEL: v_shuffle_v2bf16_v8bf16__14_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    v_mov_b32_e32 v4, 0
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def v[0:3]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    global_store_dword v4, v3, s[16:17]
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_shuffle_v2bf16_v8bf16__14_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def v[0:3]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    global_store_dword v4, v3, s[16:17]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: v_shuffle_v2bf16_v8bf16__14_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v4, 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def v[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    global_store_dword v4, v3, s[0:1]
; GFX942-NEXT:    s_waitcnt vmcnt(0)
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=v"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=v"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 15>
  store <2 x bfloat> %shuf, ptr addrspace(1) %ptr, align 4
  ret void
}

; Fully poison mask on an SGPR ("=s") input. The expected output holds only
; the "use s8" asm; no def is emitted.
define void @s_shuffle_v2bf16_v8bf16__u_u() {
; GFX9-LABEL: s_shuffle_v2bf16_v8bf16__u_u:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use s8
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> poison
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Mask <0, poison>. The def is expected in s[8:11], so s8 feeds the use with
; no extra copy or shift. The gfx942 run additionally expects an s_nop 0
; between the def and the use.
define void @s_shuffle_v2bf16_v8bf16__0_u() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__0_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[8:11]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 0, i32 poison>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Mask <1, poison>. The expected code shifts the first dword of the def right
; by 16 into s8.
define void @s_shuffle_v2bf16_v8bf16__1_u() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s8, s4, 16
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s8, s4, 16
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__1_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s8, s0, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 1, i32 poison>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Mask <2, poison>. The expected code is a plain s_mov_b32 of the second dword
; of the def into s8.
define void @s_shuffle_v2bf16_v8bf16__2_u() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s8, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s8, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__2_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s8, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 2, i32 poison>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Mask <3, poison>. The expected code shifts the second dword of the def right
; by 16 into s8.
define void @s_shuffle_v2bf16_v8bf16__3_u() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s8, s5, 16
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s8, s5, 16
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__3_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s8, s1, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 3, i32 poison>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Mask <4, poison>. The expected code is a plain s_mov_b32 of the third dword
; of the def into s8.
define void @s_shuffle_v2bf16_v8bf16__4_u() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s8, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s8, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__4_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s8, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 4, i32 poison>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Mask <5, poison>. The expected code shifts the third dword of the def right
; by 16 into s8.
define void @s_shuffle_v2bf16_v8bf16__5_u() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s8, s6, 16
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s8, s6, 16
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__5_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s8, s2, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 5, i32 poison>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Mask <6, poison>. The expected code is a plain s_mov_b32 of the fourth dword
; of the def into s8.
define void @s_shuffle_v2bf16_v8bf16__6_u() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s8, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s8, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__6_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s8, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 6, i32 poison>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Mask <7, poison>. The expected code shifts the fourth dword of the def right
; by 16 into s8.
define void @s_shuffle_v2bf16_v8bf16__7_u() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s8, s7, 16
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s8, s7, 16
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__7_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s8, s3, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 7, i32 poison>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Mask <8, poison> with a poison second operand: index 8 falls in the second
; operand, so no element of %vec0 is selected. The expected output holds only
; the "use s8" asm; no def is emitted.
define void @s_shuffle_v2bf16_v8bf16__8_u() {
; GFX9-LABEL: s_shuffle_v2bf16_v8bf16__8_u:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use s8
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 8, i32 poison>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Mask <9, poison> selects element 1 of %vec1; the mask does not reference
; %vec0. Only one def appears in the expected output, and its first dword is
; shifted right by 16 into s8.
define void @s_shuffle_v2bf16_v8bf16__9_u() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s8, s4, 16
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s8, s4, 16
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__9_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s8, s0, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 poison>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Mask <10, poison> selects element 2 of %vec1; the mask does not reference
; %vec0. Only one def appears in the expected output, and its second dword is
; copied into s8 with s_mov_b32.
define void @s_shuffle_v2bf16_v8bf16__10_u() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s8, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s8, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__10_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s8, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 poison>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Mask <11, poison> selects element 3 of %vec1; the mask does not reference
; %vec0. Only one def appears in the expected output, and its second dword is
; shifted right by 16 into s8.
define void @s_shuffle_v2bf16_v8bf16__11_u() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s8, s5, 16
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s8, s5, 16
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__11_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s8, s1, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 poison>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Mask <12, poison> selects element 4 of %vec1; the mask does not reference
; %vec0. Only one def appears in the expected output, and its third dword is
; copied into s8 with s_mov_b32.
define void @s_shuffle_v2bf16_v8bf16__12_u() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s8, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s8, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__12_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s8, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 poison>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Mask <13, poison> selects element 5 of %vec1; the mask does not reference
; %vec0. Only one def appears in the expected output, and its third dword is
; shifted right by 16 into s8.
define void @s_shuffle_v2bf16_v8bf16__13_u() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s8, s6, 16
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s8, s6, 16
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__13_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s8, s2, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 poison>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Mask <14, poison> selects element 6 of %vec1; the mask does not reference
; %vec0. Only one def appears in the expected output, and its fourth dword is
; copied into s8 with s_mov_b32.
define void @s_shuffle_v2bf16_v8bf16__14_u() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s8, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s8, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__14_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s8, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 poison>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Mask <15, poison> selects element 7 of %vec1; the mask does not reference
; %vec0. Only one def appears in the expected output, and its fourth dword is
; shifted right by 16 into s8.
define void @s_shuffle_v2bf16_v8bf16__15_u() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_u:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s8, s7, 16
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_u:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s8, s7, 16
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__15_u:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s8, s3, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 poison>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Mask <15, 0> combines element 7 of %vec1 with element 0 of %vec0, so both
; defs appear in the expected output. The last dword of the second def is
; shifted right by 16, then s_pack_ll_b32_b16 joins it with the first dword of
; the first def into s8.
define void @s_shuffle_v2bf16_v8bf16__15_0() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s11, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s11, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__15_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s7, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 0>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Mask <15, 1> combines element 7 of %vec1 with element 1 of %vec0. Both
; selected elements need a shift: the expected code shifts the first dword of
; the first def and the last dword of the second def right by 16, then joins
; them with s_pack_ll_b32_b16 into s8.
define void @s_shuffle_v2bf16_v8bf16__15_1() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_lshr_b32 s5, s11, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s11, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__15_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_lshr_b32 s1, s7, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 1>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Mask <15, 2> combines element 7 of %vec1 with element 2 of %vec0. The
; expected code shifts only the last dword of the second def right by 16 and
; feeds the second dword of the first def to s_pack_ll_b32_b16 unshifted.
define void @s_shuffle_v2bf16_v8bf16__15_2() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s11, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s11, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__15_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s7, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 2>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Mask <15, 3> combines element 7 of %vec1 with element 3 of %vec0. Both
; selected elements need a shift: the expected code shifts the second dword of
; the first def and the last dword of the second def right by 16, then joins
; them with s_pack_ll_b32_b16 into s8.
define void @s_shuffle_v2bf16_v8bf16__15_3() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
; GFX900-NEXT:    s_lshr_b32 s5, s11, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s11, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__15_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s1, 16
; GFX942-NEXT:    s_lshr_b32 s1, s7, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 3>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Mask <15, 4> combines element 7 of %vec1 with element 4 of %vec0. The
; expected code shifts only the last dword of the second def right by 16 and
; feeds the third dword of the first def to s_pack_ll_b32_b16 unshifted.
define void @s_shuffle_v2bf16_v8bf16__15_4() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s11, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s11, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__15_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s7, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 4>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle test with two <8 x bfloat> inline-asm sources ("=s" constraint).
; Mask <15, 5> takes result lane 0 from %vec1 element 7 (mask index 15 - 8) and
; result lane 1 from %vec0 element 5. The "{s8}" constraint on the use pins the
; <2 x bfloat> result to s8.
define void @s_shuffle_v2bf16_v8bf16__15_5() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
; GFX900-NEXT:    s_lshr_b32 s5, s11, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s11, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__15_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s2, 16
; GFX942-NEXT:    s_lshr_b32 s1, s7, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 5>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle test with two <8 x bfloat> inline-asm sources ("=s" constraint).
; Mask <15, 6> takes result lane 0 from %vec1 element 7 (mask index 15 - 8) and
; result lane 1 from %vec0 element 6. The "{s8}" constraint on the use pins the
; <2 x bfloat> result to s8.
define void @s_shuffle_v2bf16_v8bf16__15_6() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s11, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s11, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__15_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s7, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 6>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle test with two <8 x bfloat> inline-asm sources ("=s" constraint).
; Mask <15, 7> takes result lane 0 from %vec1 element 7 (mask index 15 - 8) and
; result lane 1 from %vec0 element 7. The "{s8}" constraint on the use pins the
; <2 x bfloat> result to s8.
define void @s_shuffle_v2bf16_v8bf16__15_7() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s7, 16
; GFX900-NEXT:    s_lshr_b32 s5, s11, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s7, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s11, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__15_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s3, 16
; GFX942-NEXT:    s_lshr_b32 s1, s7, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 7>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle test where both result lanes come from the second source.
; Mask <15, 8> takes result lane 0 from %vec1 element 7 (15 - 8) and result
; lane 1 from %vec1 element 0 (8 - 8); %vec0 does not feed the result, and the
; expected output contains only one "def" asm block. The "{s8}" constraint on
; the use pins the <2 x bfloat> result to s8.
define void @s_shuffle_v2bf16_v8bf16__15_8() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__15_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s3, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 8>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle test where both result lanes come from the second source.
; Mask <15, 9> takes result lane 0 from %vec1 element 7 (15 - 8) and result
; lane 1 from %vec1 element 1 (9 - 8); %vec0 does not feed the result, and the
; expected output contains only one "def" asm block. The "{s8}" constraint on
; the use pins the <2 x bfloat> result to s8.
define void @s_shuffle_v2bf16_v8bf16__15_9() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_lshr_b32 s5, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__15_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_lshr_b32 s1, s3, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 9>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle test where both result lanes come from the second source.
; Mask <15, 10> takes result lane 0 from %vec1 element 7 (15 - 8) and result
; lane 1 from %vec1 element 2 (10 - 8); %vec0 does not feed the result, and the
; expected output contains only one "def" asm block. The "{s8}" constraint on
; the use pins the <2 x bfloat> result to s8.
define void @s_shuffle_v2bf16_v8bf16__15_10() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__15_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s3, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 10>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle test where both result lanes come from the second source.
; Mask <15, 11> takes result lane 0 from %vec1 element 7 (15 - 8) and result
; lane 1 from %vec1 element 3 (11 - 8); %vec0 does not feed the result, and the
; expected output contains only one "def" asm block. The "{s8}" constraint on
; the use pins the <2 x bfloat> result to s8.
define void @s_shuffle_v2bf16_v8bf16__15_11() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
; GFX900-NEXT:    s_lshr_b32 s5, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__15_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s1, 16
; GFX942-NEXT:    s_lshr_b32 s1, s3, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 11>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle test where both result lanes come from the second source.
; Mask <15, 12> takes result lane 0 from %vec1 element 7 (15 - 8) and result
; lane 1 from %vec1 element 4 (12 - 8); %vec0 does not feed the result, and the
; expected output contains only one "def" asm block. The "{s8}" constraint on
; the use pins the <2 x bfloat> result to s8.
define void @s_shuffle_v2bf16_v8bf16__15_12() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__15_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s3, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 12>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle test where both result lanes come from the second source.
; Mask <15, 13> takes result lane 0 from %vec1 element 7 (15 - 8) and result
; lane 1 from %vec1 element 5 (13 - 8); %vec0 does not feed the result, and the
; expected output contains only one "def" asm block. The "{s8}" constraint on
; the use pins the <2 x bfloat> result to s8.
define void @s_shuffle_v2bf16_v8bf16__15_13() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
; GFX900-NEXT:    s_lshr_b32 s5, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__15_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s2, 16
; GFX942-NEXT:    s_lshr_b32 s1, s3, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 13>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle test where both result lanes come from the second source.
; Mask <15, 14> takes result lane 0 from %vec1 element 7 (15 - 8) and result
; lane 1 from %vec1 element 6 (14 - 8); %vec0 does not feed the result, and the
; expected output contains only one "def" asm block. The "{s8}" constraint on
; the use pins the <2 x bfloat> result to s8.
define void @s_shuffle_v2bf16_v8bf16__15_14() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__15_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s3, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 14>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle test that broadcasts one element of the second source.
; Mask <15, 15> takes both result lanes from %vec1 element 7 (15 - 8); %vec0
; does not feed the result, and the expected output contains only one "def"
; asm block. The "{s8}" constraint on the use pins the <2 x bfloat> result
; to s8.
define void @s_shuffle_v2bf16_v8bf16__15_15() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__15_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__15_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__15_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s3, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 15, i32 15>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Single-source SGPR shuffle test (second operand is poison).
; Mask <poison, 0> leaves result lane 0 unspecified and takes result lane 1
; from %vec0 element 0. The expected output is a single 16-bit left shift of
; the first def register, written straight into s8, which the "{s8}"
; constraint on the use requires.
define void @s_shuffle_v2bf16_v8bf16__u_0() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshl_b32 s8, s4, 16
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshl_b32 s8, s4, 16
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__u_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshl_b32 s8, s0, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 poison, i32 0>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Single-source SGPR shuffle test (second operand is poison).
; The zeroinitializer mask is <0, 0>, so both result lanes are %vec0
; element 0. The "{s8}" constraint on the use pins the <2 x bfloat> result
; to s8.
define void @s_shuffle_v2bf16_v8bf16__0_0() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__0_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> zeroinitializer
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Single-source SGPR shuffle test (second operand is poison).
; Mask <1, 0> takes result lane 0 from %vec0 element 1 and result lane 1 from
; %vec0 element 0. The "{s8}" constraint on the use pins the <2 x bfloat>
; result to s8.
define void @s_shuffle_v2bf16_v8bf16__1_0() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__1_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s0, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 1, i32 0>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Single-source SGPR shuffle test (second operand is poison).
; Mask <2, 0> takes result lane 0 from %vec0 element 2 and result lane 1 from
; %vec0 element 0. The "{s8}" constraint on the use pins the <2 x bfloat>
; result to s8.
define void @s_shuffle_v2bf16_v8bf16__2_0() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__2_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 2, i32 0>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Single-source SGPR shuffle test (second operand is poison).
; Mask <3, 0> takes result lane 0 from %vec0 element 3 and result lane 1 from
; %vec0 element 0. The "{s8}" constraint on the use pins the <2 x bfloat>
; result to s8.
define void @s_shuffle_v2bf16_v8bf16__3_0() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__3_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s1, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 3, i32 0>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Single-source SGPR shuffle test (second operand is poison).
; Mask <4, 0> takes result lane 0 from %vec0 element 4 and result lane 1 from
; %vec0 element 0. The "{s8}" constraint on the use pins the <2 x bfloat>
; result to s8.
define void @s_shuffle_v2bf16_v8bf16__4_0() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s6, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s6, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__4_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s2, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 4, i32 0>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Single-source SGPR shuffle test (second operand is poison).
; Mask <5, 0> takes result lane 0 from %vec0 element 5 and result lane 1 from
; %vec0 element 0. The "{s8}" constraint on the use pins the <2 x bfloat>
; result to s8.
define void @s_shuffle_v2bf16_v8bf16__5_0() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__5_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s2, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 5, i32 0>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Single-source SGPR shuffle test (second operand is poison).
; Mask <6, 0> takes result lane 0 from %vec0 element 6 and result lane 1 from
; %vec0 element 0. The "{s8}" constraint on the use pins the <2 x bfloat>
; result to s8.
define void @s_shuffle_v2bf16_v8bf16__6_0() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__6_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s3, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 6, i32 0>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case, single source, mask <7, 0>: result lane 0 = %vec0[7] and
; lane 1 = %vec0[0]. %vec0 is produced by an "=s" inline-asm def and the
; result is consumed by an inline-asm use pinned to s8.
; The checks below expect an s_lshr_b32 by 16 of the last source dword
; followed by one s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__7_0() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__7_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s3, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 7, i32 0>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case, mask <8, 0>. Mask index 8 names lane 0 of the second
; shufflevector operand, which is poison in this test, so only result
; lane 1 (= %vec0[0]) carries a defined value.
; The checks below expect a single s_lshl_b32 by 16 of the first source dword.
; NOTE(review): the sibling tests for indices 9..14 pass a second asm-defined
; %vec1 as the second operand; confirm that using poison for index 8 is
; intended, since as written lane 0 of a real second vector is not exercised.
define void @s_shuffle_v2bf16_v8bf16__8_0() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshl_b32 s8, s4, 16
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshl_b32 s8, s4, 16
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__8_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshl_b32 s8, s0, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 8, i32 0>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case, two sources, mask <9, 0>: result lane 0 = %vec1[1] and
; lane 1 = %vec0[0]. Both sources come from "=s" inline-asm defs and the
; result is consumed by an inline-asm use pinned to s8.
; The checks below expect an s_lshr_b32 by 16 followed by one
; s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__9_0() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s8, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s8, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__9_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s4, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 0>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case, two sources, mask <10, 0>: result lane 0 = %vec1[2] and
; lane 1 = %vec0[0]. Both sources come from "=s" inline-asm defs and the
; result is consumed by an inline-asm use pinned to s8.
; The checks below expect one s_pack_ll_b32_b16 and no shift.
define void @s_shuffle_v2bf16_v8bf16__10_0() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s9, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s9, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__10_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s5, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 0>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case, two sources, mask <11, 0>: result lane 0 = %vec1[3] and
; lane 1 = %vec0[0]. Both sources come from "=s" inline-asm defs and the
; result is consumed by an inline-asm use pinned to s8.
; The checks below expect an s_lshr_b32 by 16 followed by one
; s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__11_0() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s9, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s9, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__11_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s5, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 0>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case, two sources, mask <12, 0>: result lane 0 = %vec1[4] and
; lane 1 = %vec0[0]. Both sources come from "=s" inline-asm defs and the
; result is consumed by an inline-asm use pinned to s8.
; The checks below expect one s_pack_ll_b32_b16 and no shift.
define void @s_shuffle_v2bf16_v8bf16__12_0() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s10, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s10, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__12_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s6, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 0>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case, two sources, mask <13, 0>: result lane 0 = %vec1[5] and
; lane 1 = %vec0[0]. Both sources come from "=s" inline-asm defs and the
; result is consumed by an inline-asm use pinned to s8.
; The checks below expect an s_lshr_b32 by 16 followed by one
; s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__13_0() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s10, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s10, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__13_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s6, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 0>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case, two sources, mask <14, 0>: result lane 0 = %vec1[6] and
; lane 1 = %vec0[0]. Both sources come from "=s" inline-asm defs and the
; result is consumed by an inline-asm use pinned to s8.
; The checks below expect one s_pack_ll_b32_b16 and no shift.
define void @s_shuffle_v2bf16_v8bf16__14_0() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_0:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s11, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_0:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s11, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__14_0:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s7, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 0>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case, single source, mask <poison, 1>: result lane 0 is unspecified
; and lane 1 = %vec0[1]. %vec0 comes from an "=s" inline-asm def and the
; result is consumed by an inline-asm use pinned to s8.
; The checks below expect the asm def to land in s[8:11] with no shift or
; pack before the use; the gfx942 run additionally expects an s_nop 0
; between def and use.
define void @s_shuffle_v2bf16_v8bf16__u_1() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__u_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[8:11]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 poison, i32 1>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case, single source, mask <0, 1>: the result is the first two lanes
; of %vec0 in their original order. %vec0 comes from an "=s" inline-asm def
; and the result is consumed by an inline-asm use pinned to s8.
; The checks below expect the asm def to land in s[8:11] with no shift or
; pack before the use; the gfx942 run additionally expects an s_nop 0
; between def and use.
define void @s_shuffle_v2bf16_v8bf16__0_1() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__0_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[8:11]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 0, i32 1>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case, single source, mask <1, 1>: both result lanes are %vec0[1].
; %vec0 comes from an "=s" inline-asm def and the result is consumed by an
; inline-asm use pinned to s8.
; The checks below expect an s_lshr_b32 by 16 of the first source dword and
; an s_pack_ll_b32_b16 that uses the shifted register for both inputs.
define void @s_shuffle_v2bf16_v8bf16__1_1() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__1_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 1, i32 1>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case, single source, mask <2, 1>: result lane 0 = %vec0[2] and
; lane 1 = %vec0[1]. %vec0 comes from an "=s" inline-asm def and the result
; is consumed by an inline-asm use pinned to s8.
; The checks below expect one s_lshr_b32 by 16 (on the first source dword)
; followed by one s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__2_1() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__2_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 2, i32 1>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case, single source, mask <3, 1>: result lane 0 = %vec0[3] and
; lane 1 = %vec0[1]. %vec0 comes from an "=s" inline-asm def and the result
; is consumed by an inline-asm use pinned to s8.
; The checks below expect two s_lshr_b32 by 16 (first and second source
; dwords) followed by one s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__3_1() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_lshr_b32 s5, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__3_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_lshr_b32 s1, s1, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 3, i32 1>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case, single source, mask <4, 1>: result lane 0 = %vec0[4] and
; lane 1 = %vec0[1]. %vec0 comes from an "=s" inline-asm def and the result
; is consumed by an inline-asm use pinned to s8.
; The checks below expect one s_lshr_b32 by 16 (on the first source dword)
; followed by one s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__4_1() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s6, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s6, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__4_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s2, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 4, i32 1>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case, single source, mask <5, 1>: result lane 0 = %vec0[5] and
; lane 1 = %vec0[1]. %vec0 comes from an "=s" inline-asm def and the result
; is consumed by an inline-asm use pinned to s8.
; The checks below expect two s_lshr_b32 by 16 (first and third source
; dwords) followed by one s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__5_1() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__5_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_lshr_b32 s1, s2, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 5, i32 1>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case, single source, mask <6, 1>: result lane 0 = %vec0[6] and
; lane 1 = %vec0[1]. %vec0 comes from an "=s" inline-asm def and the result
; is consumed by an inline-asm use pinned to s8.
; The checks below expect one s_lshr_b32 by 16 (on the first source dword)
; followed by one s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__6_1() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__6_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s3, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 6, i32 1>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case, single source, mask <7, 1>: result lane 0 = %vec0[7] and
; lane 1 = %vec0[1]. %vec0 comes from an "=s" inline-asm def and the result
; is consumed by an inline-asm use pinned to s8.
; The checks below expect two s_lshr_b32 by 16 (first and last source
; dwords) followed by one s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__7_1() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_lshr_b32 s5, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__7_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_lshr_b32 s1, s3, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 7, i32 1>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case, mask <8, 1>. Mask index 8 names lane 0 of the second
; shufflevector operand, which is poison in this test, so only result
; lane 1 (= %vec0[1]) carries a defined value.
; The checks below expect the asm def to land in s[8:11] with no shift or
; pack before the use (plus an s_nop 0 on the gfx942 run), i.e. the same
; instruction sequence expected for @s_shuffle_v2bf16_v8bf16__u_1.
; NOTE(review): the sibling tests for indices 9..14 pass a second asm-defined
; %vec1 as the second operand; confirm that using poison for index 8 is
; intended, since as written lane 0 of a real second vector is not exercised.
define void @s_shuffle_v2bf16_v8bf16__8_1() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__8_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[8:11]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 8, i32 1>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case, two sources, mask <9, 1>: result lane 0 = %vec1[1] and
; lane 1 = %vec0[1]. Both sources come from "=s" inline-asm defs and the
; result is consumed by an inline-asm use pinned to s8.
; The checks below expect two s_lshr_b32 by 16 (one per source vector's
; first dword) followed by one s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__9_1() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_lshr_b32 s5, s8, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s8, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__9_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_lshr_b32 s1, s4, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 1>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case, two sources, mask <10, 1>: result lane 0 = %vec1[2] and
; lane 1 = %vec0[1]. Both sources come from "=s" inline-asm defs and the
; result is consumed by an inline-asm use pinned to s8.
; The checks below expect one s_lshr_b32 by 16 followed by one
; s_pack_ll_b32_b16. On the gfx942 run the shift is expected between the
; two asm defs rather than after both of them.
define void @s_shuffle_v2bf16_v8bf16__10_1() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s9, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s9, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__10_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s5, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 1>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

define void @s_shuffle_v2bf16_v8bf16__11_1() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_lshr_b32 s5, s9, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s9, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__11_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_lshr_b32 s1, s5, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Scalar-register shuffle with mask <11, 1>: lane 0 is element 3 of %vec1
  ; (mask index 11 - 8) and lane 1 is element 1 of %vec0. Both selected
  ; elements are odd-numbered, and the checks expect two s_lshr_b32 by 16 (one
  ; per source) before s_pack_ll_b32_b16 writes s8 for the "use" asm.
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 1>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

define void @s_shuffle_v2bf16_v8bf16__12_1() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s10, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s10, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__12_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s6, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Scalar-register shuffle with mask <12, 1>: lane 0 is element 4 of %vec1
  ; (mask index 12 - 8) and lane 1 is element 1 of %vec0. The checks expect a
  ; single s_lshr_b32 by 16 followed by s_pack_ll_b32_b16 into s8, the register
  ; the "use" asm is constrained to.
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 1>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

define void @s_shuffle_v2bf16_v8bf16__13_1() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_lshr_b32 s5, s10, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s10, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__13_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_lshr_b32 s1, s6, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Scalar-register shuffle with mask <13, 1>: lane 0 is element 5 of %vec1
  ; (mask index 13 - 8) and lane 1 is element 1 of %vec0. Both selected
  ; elements are odd-numbered, and the checks expect two s_lshr_b32 by 16 (one
  ; per source) before s_pack_ll_b32_b16 writes s8 for the "use" asm.
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 1>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

define void @s_shuffle_v2bf16_v8bf16__14_1() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_1:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s11, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_1:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s11, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__14_1:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s7, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Scalar-register shuffle with mask <14, 1>: lane 0 is element 6 of %vec1
  ; (mask index 14 - 8) and lane 1 is element 1 of %vec0. The checks expect a
  ; single s_lshr_b32 by 16 followed by s_pack_ll_b32_b16 into s8, the register
  ; the "use" asm is constrained to.
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 1>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

define void @s_shuffle_v2bf16_v8bf16__u_2() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshl_b32 s8, s5, 16
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshl_b32 s8, s5, 16
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__u_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshl_b32 s8, s1, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Scalar-register shuffle with mask <poison, 2>: lane 0 is unspecified and
  ; lane 1 is element 2 of %vec0. With only one defined lane, the checks expect
  ; no pack instruction, just one s_lshl_b32 by 16 writing s8 for the "use" asm.
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 poison, i32 2>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

define void @s_shuffle_v2bf16_v8bf16__0_2() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__0_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Scalar-register shuffle with mask <0, 2>: both lanes are even-numbered
  ; elements of %vec0. The checks expect no shifts, only one
  ; s_pack_ll_b32_b16 of the first two source dwords into s8 for the "use" asm.
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 0, i32 2>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

define void @s_shuffle_v2bf16_v8bf16__1_2() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__1_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Scalar-register shuffle with mask <1, 2>: lane 0 is element 1 and lane 1
  ; is element 2 of %vec0. The checks expect one s_lshr_b32 by 16 on the first
  ; source dword, then s_pack_ll_b32_b16 into s8 for the "use" asm.
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 1, i32 2>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

define void @s_shuffle_v2bf16_v8bf16__2_2() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__2_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Scalar-register shuffle with mask <2, 2>: element 2 of %vec0 is broadcast
  ; to both lanes. The checks expect a single s_pack_ll_b32_b16 that names the
  ; same source register for both operands and writes s8 for the "use" asm.
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 2, i32 2>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

define void @s_shuffle_v2bf16_v8bf16__3_2() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__3_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s1, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Scalar-register shuffle with mask <3, 2>: elements 3 and 2 of %vec0 in
  ; swapped order. In the checks both lanes come from the same source dword: it
  ; is shifted right by 16 for lane 0 and used unshifted for lane 1, then
  ; s_pack_ll_b32_b16 writes s8 for the "use" asm.
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 3, i32 2>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

define void @s_shuffle_v2bf16_v8bf16__4_2() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s6, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s6, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__4_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s2, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Scalar-register shuffle with mask <4, 2>: both lanes are even-numbered
  ; elements of %vec0. The checks expect no shifts, only one
  ; s_pack_ll_b32_b16 of two source dwords into s8 for the "use" asm.
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 4, i32 2>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

define void @s_shuffle_v2bf16_v8bf16__5_2() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__5_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s2, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Scalar-register shuffle with mask <5, 2>: lane 0 is element 5 and lane 1
  ; is element 2 of %vec0. The checks expect one s_lshr_b32 by 16 for the
  ; odd-numbered element, then s_pack_ll_b32_b16 into s8 for the "use" asm.
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 5, i32 2>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

define void @s_shuffle_v2bf16_v8bf16__6_2() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__6_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s3, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Scalar-register shuffle with mask <6, 2>: both lanes are even-numbered
  ; elements of %vec0. The checks expect no shifts, only one
  ; s_pack_ll_b32_b16 of two source dwords into s8 for the "use" asm.
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 6, i32 2>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

define void @s_shuffle_v2bf16_v8bf16__7_2() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__7_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s3, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Scalar-register shuffle with mask <7, 2>: lane 0 is element 7 and lane 1
  ; is element 2 of %vec0. The checks expect one s_lshr_b32 by 16 for the
  ; odd-numbered element, then s_pack_ll_b32_b16 into s8 for the "use" asm.
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 7, i32 2>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

define void @s_shuffle_v2bf16_v8bf16__8_2() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshl_b32 s8, s5, 16
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshl_b32 s8, s5, 16
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__8_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshl_b32 s8, s1, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Scalar-register shuffle with mask <8, 2>: mask index 8 addresses element 0
  ; of the second operand, which is poison in this function, so only lane 1
  ; (element 2 of %vec0) is defined. The checks therefore match the
  ; s_shuffle_v2bf16_v8bf16__u_2 case: one s_lshl_b32 by 16 into s8, no pack.
  ; NOTE(review): unlike the __9_2 and later variants, no %vec1 is defined
  ; here; confirm that testing against a poison operand is intended.
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 8, i32 2>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

define void @s_shuffle_v2bf16_v8bf16__9_2() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s8, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s8, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__9_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s4, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Scalar-register shuffle with mask <9, 2>: lane 0 is element 1 of %vec1
  ; (mask index 9 - 8) and lane 1 is element 2 of %vec0. The checks expect one
  ; s_lshr_b32 by 16 on the first dword of the second def, then
  ; s_pack_ll_b32_b16 into s8 for the "use" asm.
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 2>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

define void @s_shuffle_v2bf16_v8bf16__10_2() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s9, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s9, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__10_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s5, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Scalar-register shuffle with mask <10, 2>: lane 0 is element 2 of %vec1
  ; (mask index 10 - 8) and lane 1 is element 2 of %vec0. Both are
  ; even-numbered, and the checks expect no shifts, only one
  ; s_pack_ll_b32_b16 into s8 for the "use" asm.
  ; NOTE(review): the gfx900 and gfx90a checks list the s[8:11] def before the
  ; s[4:7] def; presumably the two defs (not marked sideeffect) were emitted in
  ; swapped order. Regenerate the checks rather than hand-editing them.
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 2>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

define void @s_shuffle_v2bf16_v8bf16__11_2() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s9, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s9, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__11_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s5, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Scalar-register shuffle with mask <11, 2>: lane 0 is element 3 of %vec1
  ; (mask index 11 - 8) and lane 1 is element 2 of %vec0. The checks expect one
  ; s_lshr_b32 by 16 for the odd-numbered element, then s_pack_ll_b32_b16 into
  ; s8 for the "use" asm.
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 2>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

define void @s_shuffle_v2bf16_v8bf16__12_2() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s10, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s10, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__12_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s6, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Scalar-register shuffle with mask <12, 2>: lane 0 is element 4 of %vec1
  ; (mask index 12 - 8) and lane 1 is element 2 of %vec0. Both are
  ; even-numbered, and the checks expect no shifts, only one
  ; s_pack_ll_b32_b16 into s8 for the "use" asm.
  ; NOTE(review): the gfx900 and gfx90a checks list the s[8:11] def before the
  ; s[4:7] def; presumably the two defs (not marked sideeffect) were emitted in
  ; swapped order. Regenerate the checks rather than hand-editing them.
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 2>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

define void @s_shuffle_v2bf16_v8bf16__13_2() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s10, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s10, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__13_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s6, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  ; Scalar-register shuffle with mask <13, 2>: lane 0 is element 5 of %vec1
  ; (mask index 13 - 8) and lane 1 is element 2 of %vec0. The checks expect one
  ; s_lshr_b32 by 16 for the odd-numbered element, then s_pack_ll_b32_b16 into
  ; s8 for the "use" asm.
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 2>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle with mask <14, 2>: lane 0 is element 6 of %vec1 and
; lane 1 is element 2 of %vec0. Both are even-numbered elements, so the checks
; expect a single s_pack_ll_b32_b16 into s8 with no preceding shift.
define void @s_shuffle_v2bf16_v8bf16__14_2() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_2:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s11, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_2:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s11, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__14_2:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s7, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 2>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle with mask <poison, 3>: only lane 1 is defined, and
; element 3 of %vec0 is already the upper half of its second dword. The checks
; therefore expect just an s_mov_b32 of that dword into s8.
define void @s_shuffle_v2bf16_v8bf16__u_3() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s8, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s8, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__u_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s8, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 poison, i32 3>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle with mask <0, 3>: lane 0 is element 0 of %vec0 (lower
; half of its first dword) and lane 1 is element 3 (upper half of its second
; dword). The checks expect one s_lshr_b32 by 16 for element 3, then
; s_pack_ll_b32_b16 into s8.
define void @s_shuffle_v2bf16_v8bf16__0_3() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__0_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s1, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 0, i32 3>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle with mask <1, 3>: both lanes are odd-numbered
; elements of %vec0 (upper halves of its first and second dwords). The checks
; expect two s_lshr_b32 by 16, one per source dword, then s_pack_ll_b32_b16
; into s8.
define void @s_shuffle_v2bf16_v8bf16__1_3() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s5, 16
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s5, 16
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__1_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s1, 16
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 1, i32 3>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle with mask <2, 3>: elements 2 and 3 of %vec0 are the
; two halves of its second dword, already in the requested order. The checks
; expect a plain s_mov_b32 of that dword into s8.
define void @s_shuffle_v2bf16_v8bf16__2_3() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s8, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s8, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__2_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s8, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 2, i32 3>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle with mask <3, 3>: element 3 of %vec0 (upper half of
; its second dword) is broadcast to both lanes. The checks expect one
; s_lshr_b32 by 16 and an s_pack_ll_b32_b16 that uses the shifted value for
; both operands.
define void @s_shuffle_v2bf16_v8bf16__3_3() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__3_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s1, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 3, i32 3>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle with mask <4, 3>: lane 0 is element 4 of %vec0 (lower
; half of its third dword) and lane 1 is element 3 (upper half of its second
; dword). The checks expect one s_lshr_b32 by 16 for element 3, then
; s_pack_ll_b32_b16 into s8.
define void @s_shuffle_v2bf16_v8bf16__4_3() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s6, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s6, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__4_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s1, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s2, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 4, i32 3>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle with mask <5, 3>: both lanes are odd-numbered
; elements of %vec0 (upper halves of its third and second dwords). The checks
; expect two s_lshr_b32 by 16, then s_pack_ll_b32_b16 into s8.
define void @s_shuffle_v2bf16_v8bf16__5_3() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__5_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s1, 16
; GFX942-NEXT:    s_lshr_b32 s1, s2, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 5, i32 3>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle with mask <6, 3>: lane 0 is element 6 of %vec0 (lower
; half of its fourth dword) and lane 1 is element 3 (upper half of its second
; dword). The checks expect one s_lshr_b32 by 16 for element 3, then
; s_pack_ll_b32_b16 into s8.
define void @s_shuffle_v2bf16_v8bf16__6_3() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__6_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s1, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s3, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 6, i32 3>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle with mask <7, 3>: both lanes are odd-numbered
; elements of %vec0 (upper halves of its fourth and second dwords). The checks
; expect two s_lshr_b32 by 16, then s_pack_ll_b32_b16 into s8.
define void @s_shuffle_v2bf16_v8bf16__7_3() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
; GFX900-NEXT:    s_lshr_b32 s5, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__7_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s1, 16
; GFX942-NEXT:    s_lshr_b32 s1, s3, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 7, i32 3>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle with mask <8, 3>. Index 8 addresses element 0 of the
; second shuffle operand, which is poison in this function, so only lane 1
; (element 3 of %vec0) is defined. The checks are the same single s_mov_b32 of
; %vec0's second dword into s8 that the __u_3 case expects.
; NOTE(review): unlike the __9_3 and later cases there is no %vec1 def here, so
; this test does not exercise a real cross-operand select; confirm against the
; test generator whether that is intended.
define void @s_shuffle_v2bf16_v8bf16__8_3() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s8, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s8, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__8_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s8, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 8, i32 3>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle with mask <9, 3>: lane 0 is element 1 of %vec1 (upper
; half of its first dword) and lane 1 is element 3 of %vec0 (upper half of its
; second dword). The checks expect two s_lshr_b32 by 16, one per input, then
; s_pack_ll_b32_b16 into s8.
define void @s_shuffle_v2bf16_v8bf16__9_3() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
; GFX900-NEXT:    s_lshr_b32 s5, s8, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s8, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__9_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s1, 16
; GFX942-NEXT:    s_lshr_b32 s1, s4, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 3>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle with mask <10, 3>: lane 0 is element 2 of %vec1
; (lower half of its second dword) and lane 1 is element 3 of %vec0 (upper half
; of its second dword). The checks expect one s_lshr_b32 by 16 for the %vec0
; element, then s_pack_ll_b32_b16 into s8. In the gfx942 checks the shift
; is placed between the two inline-asm defs.
define void @s_shuffle_v2bf16_v8bf16__10_3() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s9, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s9, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__10_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s1, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s5, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 3>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle with mask <11, 3>: lane 0 is element 3 of %vec1 and
; lane 1 is element 3 of %vec0, i.e. the upper half of each input's second
; dword. The checks expect two s_lshr_b32 by 16, one per input, then
; s_pack_ll_b32_b16 into s8.
define void @s_shuffle_v2bf16_v8bf16__11_3() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
; GFX900-NEXT:    s_lshr_b32 s5, s9, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s9, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__11_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s1, 16
; GFX942-NEXT:    s_lshr_b32 s1, s5, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 3>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle with mask <12, 3>: lane 0 is element 4 of %vec1
; (lower half of its third dword) and lane 1 is element 3 of %vec0 (upper half
; of its second dword). The checks expect one s_lshr_b32 by 16 for the %vec0
; element, then s_pack_ll_b32_b16 into s8. In the gfx942 checks the shift
; is placed between the two inline-asm defs.
define void @s_shuffle_v2bf16_v8bf16__12_3() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s10, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s10, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__12_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s1, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s6, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 3>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle with mask <13, 3>: lane 0 is element 5 of %vec1
; (upper half of its third dword) and lane 1 is element 3 of %vec0 (upper half
; of its second dword). The checks expect two s_lshr_b32 by 16, one per input,
; then s_pack_ll_b32_b16 into s8.
define void @s_shuffle_v2bf16_v8bf16__13_3() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
; GFX900-NEXT:    s_lshr_b32 s5, s10, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s10, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__13_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s1, 16
; GFX942-NEXT:    s_lshr_b32 s1, s6, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 3>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle with mask <14, 3>: lane 0 is element 6 of %vec1
; (lower half of its fourth dword) and lane 1 is element 3 of %vec0 (upper half
; of its second dword). The checks expect one s_lshr_b32 by 16 for the %vec0
; element, then s_pack_ll_b32_b16 into s8. In the gfx942 checks the shift
; is placed between the two inline-asm defs.
define void @s_shuffle_v2bf16_v8bf16__14_3() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_3:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s11, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_3:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s11, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__14_3:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s1, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s7, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 3>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle with mask <poison, 4>: only lane 1 is defined, and
; element 4 of %vec0 is the lower half of its third dword. The checks expect a
; single s_lshl_b32 by 16 that moves it into the upper half of s8; no pack is
; needed because lane 0 is undefined.
define void @s_shuffle_v2bf16_v8bf16__u_4() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshl_b32 s8, s6, 16
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshl_b32 s8, s6, 16
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__u_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshl_b32 s8, s2, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 poison, i32 4>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle with mask <0, 4>: lane 0 is element 0 of %vec0 and
; lane 1 is element 4, the lower halves of its first and third dwords. Both are
; even-numbered elements, so the checks expect a single s_pack_ll_b32_b16 into
; s8 with no preceding shift.
define void @s_shuffle_v2bf16_v8bf16__0_4() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__0_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 0, i32 4>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case: build <2 x bfloat> from lanes 1 and 4 of a single asm-defined
; <8 x bfloat>. The checks expect the first register of the tuple to be
; shifted right by 16 (bringing lane 1 down) before s_pack_ll_b32_b16
; combines it with the third register.
define void @s_shuffle_v2bf16_v8bf16__1_4() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__1_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 1, i32 4>
  ; The "{s8}" constraint pins the shuffled value to s8 for the asm use.
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case: build <2 x bfloat> from lanes 2 and 4 of a single asm-defined
; <8 x bfloat>. The checks expect one s_pack_ll_b32_b16 taking the second and
; third registers of the defined tuple directly, with no shift beforehand.
define void @s_shuffle_v2bf16_v8bf16__2_4() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__2_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 2, i32 4>
  ; The "{s8}" constraint pins the shuffled value to s8 for the asm use.
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case: build <2 x bfloat> from lanes 3 and 4 of a single asm-defined
; <8 x bfloat>. The checks expect the second register of the tuple to be
; shifted right by 16 (bringing lane 3 down) before s_pack_ll_b32_b16
; combines it with the third register.
define void @s_shuffle_v2bf16_v8bf16__3_4() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__3_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s1, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 3, i32 4>
  ; The "{s8}" constraint pins the shuffled value to s8 for the asm use.
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case: splat lane 4 of a single asm-defined <8 x bfloat> into both
; result lanes. The checks expect one s_pack_ll_b32_b16 whose two source
; operands are the same (third) register of the defined tuple.
define void @s_shuffle_v2bf16_v8bf16__4_4() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s6, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s6, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__4_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s2, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 4, i32 4>
  ; The "{s8}" constraint pins the shuffled value to s8 for the asm use.
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case: build <2 x bfloat> from lanes 5 and 4 of a single asm-defined
; <8 x bfloat>, i.e. the two halves of the tuple's third register in swapped
; order. The checks expect that register shifted right by 16 and then packed
; (s_pack_ll_b32_b16) with its unshifted self.
define void @s_shuffle_v2bf16_v8bf16__5_4() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__5_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s2, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 5, i32 4>
  ; The "{s8}" constraint pins the shuffled value to s8 for the asm use.
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case: build <2 x bfloat> from lanes 6 and 4 of a single asm-defined
; <8 x bfloat>. The checks expect one s_pack_ll_b32_b16 taking the fourth and
; third registers of the defined tuple directly, with no shift beforehand.
define void @s_shuffle_v2bf16_v8bf16__6_4() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__6_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s3, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 6, i32 4>
  ; The "{s8}" constraint pins the shuffled value to s8 for the asm use.
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case: build <2 x bfloat> from lanes 7 and 4 of a single asm-defined
; <8 x bfloat>. The checks expect the fourth register of the tuple to be
; shifted right by 16 (bringing lane 7 down) before s_pack_ll_b32_b16
; combines it with the third register.
define void @s_shuffle_v2bf16_v8bf16__7_4() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__7_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s3, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 7, i32 4>
  ; The "{s8}" constraint pins the shuffled value to s8 for the asm use.
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case: mask <8, 4> over one asm-defined <8 x bfloat> and a poison second
; operand. Mask index 8 addresses lane 0 of the second operand, which is
; poison here, so only lane 4 carries a defined value. The checks expect a
; single s_lshl_b32 by 16 of the tuple's third register and no pack.
define void @s_shuffle_v2bf16_v8bf16__8_4() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshl_b32 s8, s6, 16
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshl_b32 s8, s6, 16
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__8_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshl_b32 s8, s2, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 8, i32 4>
  ; The "{s8}" constraint pins the shuffled value to s8 for the asm use.
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR two-input case: mask <9, 4> takes lane 1 of %vec1 (indices 8..15 select
; from the second operand) and lane 4 of %vec0. The checks expect two asm
; defs, a right shift by 16 of the first register of one tuple, and an
; s_pack_ll_b32_b16 with the third register of the other tuple.
define void @s_shuffle_v2bf16_v8bf16__9_4() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s8, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s8, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__9_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s4, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 4>
  ; The "{s8}" constraint pins the shuffled value to s8 for the asm use.
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR two-input case: mask <10, 4> takes lane 2 of %vec1 (indices 8..15
; select from the second operand) and lane 4 of %vec0. The checks expect two
; asm defs followed by a single s_pack_ll_b32_b16 of the second register of
; one tuple with the third register of the other; no shift is needed.
define void @s_shuffle_v2bf16_v8bf16__10_4() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s9, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s9, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__10_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s5, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 4>
  ; The "{s8}" constraint pins the shuffled value to s8 for the asm use.
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR two-input case: mask <11, 4> takes lane 3 of %vec1 (indices 8..15
; select from the second operand) and lane 4 of %vec0. The checks expect two
; asm defs, a right shift by 16 of the second register of one tuple, and an
; s_pack_ll_b32_b16 with the third register of the other tuple.
define void @s_shuffle_v2bf16_v8bf16__11_4() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s9, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s9, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__11_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s5, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 4>
  ; The "{s8}" constraint pins the shuffled value to s8 for the asm use.
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR two-input case: mask <12, 4> takes lane 4 of %vec1 (indices 8..15
; select from the second operand) and lane 4 of %vec0. The checks expect two
; asm defs followed by a single s_pack_ll_b32_b16 of the third registers of
; the two tuples; no shift is needed.
define void @s_shuffle_v2bf16_v8bf16__12_4() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s10, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s10, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__12_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s6, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 4>
  ; The "{s8}" constraint pins the shuffled value to s8 for the asm use.
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR two-input case: mask <13, 4> takes lane 5 of %vec1 (indices 8..15
; select from the second operand) and lane 4 of %vec0. The checks expect two
; asm defs, a right shift by 16 of the third register of one tuple, and an
; s_pack_ll_b32_b16 with the third register of the other tuple.
define void @s_shuffle_v2bf16_v8bf16__13_4() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s10, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s10, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__13_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s6, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 4>
  ; The "{s8}" constraint pins the shuffled value to s8 for the asm use.
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR two-input case: mask <14, 4> takes lane 6 of %vec1 (indices 8..15
; select from the second operand) and lane 4 of %vec0. The checks expect two
; asm defs followed by a single s_pack_ll_b32_b16 of the fourth register of
; one tuple with the third register of the other; no shift is needed.
define void @s_shuffle_v2bf16_v8bf16__14_4() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_4:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s11, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_4:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s11, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__14_4:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s7, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 4>
  ; The "{s8}" constraint pins the shuffled value to s8 for the asm use.
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case: mask <poison, 5> over a single asm-defined <8 x bfloat>; only the
; second result lane is specified. The checks expect a plain s_mov_b32 of the
; tuple's third register into s8, with no shift or pack.
define void @s_shuffle_v2bf16_v8bf16__u_5() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s8, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s8, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__u_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s8, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 poison, i32 5>
  ; The "{s8}" constraint pins the shuffled value to s8 for the asm use.
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case: build <2 x bfloat> from lanes 0 and 5 of a single asm-defined
; <8 x bfloat>. The checks expect the tuple's third register to be shifted
; right by 16 (bringing lane 5 down) and then used as the second source of an
; s_pack_ll_b32_b16 whose first source is the tuple's first register.
define void @s_shuffle_v2bf16_v8bf16__0_5() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__0_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s2, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 0, i32 5>
  ; The "{s8}" constraint pins the shuffled value to s8 for the asm use.
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case: build <2 x bfloat> from lanes 1 and 5 of a single asm-defined
; <8 x bfloat>. The checks expect both the first and the third register of
; the tuple to be shifted right by 16 before one s_pack_ll_b32_b16 combines
; the two shifted values.
define void @s_shuffle_v2bf16_v8bf16__1_5() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__1_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s2, 16
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 1, i32 5>
  ; The "{s8}" constraint pins the shuffled value to s8 for the asm use.
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case: build <2 x bfloat> from lanes 2 and 5 of a single asm-defined
; <8 x bfloat>. The checks expect the tuple's third register to be shifted
; right by 16 (bringing lane 5 down) and then used as the second source of an
; s_pack_ll_b32_b16 whose first source is the tuple's second register.
define void @s_shuffle_v2bf16_v8bf16__2_5() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__2_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s2, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 2, i32 5>
  ; The "{s8}" constraint pins the shuffled value to s8 for the asm use.
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case: build <2 x bfloat> from lanes 3 and 5 of a single asm-defined
; <8 x bfloat>. The checks expect both the second and the third register of
; the tuple to be shifted right by 16 before one s_pack_ll_b32_b16 combines
; the two shifted values.
define void @s_shuffle_v2bf16_v8bf16__3_5() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
; GFX900-NEXT:    s_lshr_b32 s5, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__3_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s2, 16
; GFX942-NEXT:    s_lshr_b32 s1, s1, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 3, i32 5>
  ; The "{s8}" constraint pins the shuffled value to s8 for the asm use.
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR case: build <2 x bfloat> from lanes 4 and 5 of a single asm-defined
; <8 x bfloat>, i.e. one adjacent pair in its original order. The checks
; expect a plain s_mov_b32 of the tuple's third register into s8, with no
; shift or pack.
define void @s_shuffle_v2bf16_v8bf16__4_5() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s8, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s8, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__4_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s8, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 4, i32 5>
  ; The "{s8}" constraint pins the shuffled value to s8 for the asm use.
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle broadcasting element 5 of a single <8 x bfloat> into both lanes.
; Element 5 is the high half of the third source register, so the checks
; expect a 16-bit right shift and then s_pack_ll_b32_b16 with the shifted
; value used for both operands.
define void @s_shuffle_v2bf16_v8bf16__5_5() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__5_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s2, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 5, i32 5>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle producing <element 6, element 5> from a single <8 x bfloat>.
; Element 6 already sits in the low half of the fourth source register and is
; used directly; element 5 is shifted down from the high half of the third
; register before s_pack_ll_b32_b16 combines the two.
define void @s_shuffle_v2bf16_v8bf16__6_5() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__6_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s2, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s3, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 6, i32 5>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle producing <element 7, element 5> from a single <8 x bfloat>.
; Both elements are high halves (fourth and third source registers), so the
; checks expect two 16-bit right shifts feeding s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__7_5() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
; GFX900-NEXT:    s_lshr_b32 s5, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__7_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s2, 16
; GFX942-NEXT:    s_lshr_b32 s1, s3, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 7, i32 5>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <8, 5> and a poison second operand. Mask index 8
; addresses element 0 of the second operand, so lane 0 carries no defined
; value and only element 5 (high half of the third source register) matters.
; The checks expect a plain s_mov_b32 of that whole register into s8.
define void @s_shuffle_v2bf16_v8bf16__8_5() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s8, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s8, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__8_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s8, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 8, i32 5>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Two-source SGPR shuffle with mask <9, 5>: lane 0 is element 1 of %vec1
; (high half of its first register) and lane 1 is element 5 of %vec0 (high
; half of its third register). The checks expect both halves to be shifted
; down by 16 and then combined with s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__9_5() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
; GFX900-NEXT:    s_lshr_b32 s5, s8, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s8, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__9_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s2, 16
; GFX942-NEXT:    s_lshr_b32 s1, s4, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 5>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Two-source SGPR shuffle with mask <10, 5>: lane 0 is element 2 of %vec1
; (low half of its second register, used directly) and lane 1 is element 5 of
; %vec0 (shifted down from the high half of its third register). Note that the
; gfx942 checks place the shift between the two asm defs, unlike the other
; two targets.
define void @s_shuffle_v2bf16_v8bf16__10_5() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s9, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s9, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__10_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s2, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s5, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 5>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Two-source SGPR shuffle with mask <11, 5>: lane 0 is element 3 of %vec1
; (high half of its second register) and lane 1 is element 5 of %vec0 (high
; half of its third register). Both are shifted down by 16 before
; s_pack_ll_b32_b16 combines them.
define void @s_shuffle_v2bf16_v8bf16__11_5() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
; GFX900-NEXT:    s_lshr_b32 s5, s9, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s9, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__11_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s2, 16
; GFX942-NEXT:    s_lshr_b32 s1, s5, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 5>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Two-source SGPR shuffle with mask <12, 5>: lane 0 is element 4 of %vec1
; (low half of its third register, used directly) and lane 1 is element 5 of
; %vec0 (shifted down from the high half of its third register). Note that the
; gfx942 checks place the shift between the two asm defs, unlike the other
; two targets.
define void @s_shuffle_v2bf16_v8bf16__12_5() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s10, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s10, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__12_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s2, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s6, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 5>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Two-source SGPR shuffle with mask <13, 5>: lane 0 is element 5 of %vec1 and
; lane 1 is element 5 of %vec0, i.e. the high half of the third register of
; each source tuple. Both are shifted down by 16 before s_pack_ll_b32_b16
; combines them.
define void @s_shuffle_v2bf16_v8bf16__13_5() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
; GFX900-NEXT:    s_lshr_b32 s5, s10, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s10, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__13_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s2, 16
; GFX942-NEXT:    s_lshr_b32 s1, s6, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 5>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Two-source SGPR shuffle with mask <14, 5>: lane 0 is element 6 of %vec1
; (low half of its fourth register, used directly) and lane 1 is element 5 of
; %vec0 (shifted down from the high half of its third register). Note that the
; gfx942 checks place the shift between the two asm defs, unlike the other
; two targets.
define void @s_shuffle_v2bf16_v8bf16__14_5() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_5:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s11, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_5:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s11, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__14_5:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s2, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s7, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 5>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <poison, 6> on a single <8 x bfloat>. Only lane 1 is
; defined: element 6 is the low half of the fourth source register, so the
; checks expect a single s_lshl_b32 by 16 that moves it into the high half
; of s8.
define void @s_shuffle_v2bf16_v8bf16__u_6() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshl_b32 s8, s7, 16
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshl_b32 s8, s7, 16
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__u_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshl_b32 s8, s3, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 poison, i32 6>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle producing <element 0, element 6> from a single <8 x bfloat>.
; Both elements are low halves (first and fourth source registers), so the
; checks expect a lone s_pack_ll_b32_b16 with no preceding shifts.
define void @s_shuffle_v2bf16_v8bf16__0_6() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__0_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 0, i32 6>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle producing <element 1, element 6> from a single <8 x bfloat>.
; Element 1 is shifted down from the high half of the first source register;
; element 6 is the low half of the fourth register and feeds
; s_pack_ll_b32_b16 directly.
define void @s_shuffle_v2bf16_v8bf16__1_6() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__1_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 1, i32 6>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle producing <element 2, element 6> from a single <8 x bfloat>.
; Both elements are low halves (second and fourth source registers), so the
; checks expect a lone s_pack_ll_b32_b16 with no preceding shifts.
define void @s_shuffle_v2bf16_v8bf16__2_6() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__2_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 2, i32 6>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle producing <element 3, element 6> from a single <8 x bfloat>.
; Element 3 is shifted down from the high half of the second source register;
; element 6 is the low half of the fourth register and feeds
; s_pack_ll_b32_b16 directly.
define void @s_shuffle_v2bf16_v8bf16__3_6() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__3_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s1, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 3, i32 6>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle producing <element 4, element 6> from a single <8 x bfloat>.
; Both elements are low halves (third and fourth source registers), so the
; checks expect a lone s_pack_ll_b32_b16 with no preceding shifts.
define void @s_shuffle_v2bf16_v8bf16__4_6() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s6, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s6, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__4_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s2, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 4, i32 6>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle producing <element 5, element 6> from a single <8 x bfloat>.
; The pair straddles two source registers: element 5 is shifted down from the
; high half of the third register, element 6 is the low half of the fourth,
; and s_pack_ll_b32_b16 combines them.
define void @s_shuffle_v2bf16_v8bf16__5_6() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__5_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s2, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 5, i32 6>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle broadcasting element 6 of a single <8 x bfloat> into both lanes.
; Element 6 is the low half of the fourth source register, so the checks
; expect a lone s_pack_ll_b32_b16 with that register as both operands.
define void @s_shuffle_v2bf16_v8bf16__6_6() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__6_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 6, i32 6>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle producing <element 7, element 6>, i.e. swapping the two halves
; of the fourth source register. The checks expect the high half to be shifted
; down by 16 and then packed (as lane 0) with the unshifted register (whose
; low half supplies lane 1) via s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__7_6() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__7_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s3, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 7, i32 6>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Shuffle mask <8, 6>: mask index 8 addresses element 0 of the second shuffle
; operand, which is poison here, so only result lane 1 (element 6 of %vec0) is
; a defined value. %vec0 is an <8 x bfloat> defined by inline asm with the "=s"
; constraint; the result is consumed by inline asm through the fixed register
; s8. The expected code is a single left shift by 16 into s8.
define void @s_shuffle_v2bf16_v8bf16__8_6() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshl_b32 s8, s7, 16
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshl_b32 s8, s7, 16
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__8_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshl_b32 s8, s3, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 8, i32 6>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Two-input shuffle, mask <9, 6>: result lane 0 is mask index 9, i.e. element 1
; of %vec1 (indices 8-15 address the second operand); lane 1 is element 6 of
; %vec0. Both inputs are <8 x bfloat> values defined by inline asm with the
; "=s" constraint; the result is consumed by inline asm through the fixed
; register s8. The expected code is a right shift by 16 followed by
; s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__9_6() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s8, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s8, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__9_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s4, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 6>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Two-input shuffle, mask <10, 6>: result lane 0 is mask index 10, i.e. element
; 2 of %vec1 (indices 8-15 address the second operand); lane 1 is element 6 of
; %vec0. Both inputs are <8 x bfloat> values defined by inline asm with the
; "=s" constraint; the result is consumed by inline asm through the fixed
; register s8. The expected code is a single s_pack_ll_b32_b16 with no shift.
define void @s_shuffle_v2bf16_v8bf16__10_6() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s9, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s9, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__10_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s5, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 6>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Two-input shuffle, mask <11, 6>: result lane 0 is mask index 11, i.e. element
; 3 of %vec1 (indices 8-15 address the second operand); lane 1 is element 6 of
; %vec0. Both inputs are <8 x bfloat> values defined by inline asm with the
; "=s" constraint; the result is consumed by inline asm through the fixed
; register s8. The expected code is a right shift by 16 followed by
; s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__11_6() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s9, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s9, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__11_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s5, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 6>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Two-input shuffle, mask <12, 6>: result lane 0 is mask index 12, i.e. element
; 4 of %vec1 (indices 8-15 address the second operand); lane 1 is element 6 of
; %vec0. Both inputs are <8 x bfloat> values defined by inline asm with the
; "=s" constraint; the result is consumed by inline asm through the fixed
; register s8. The expected code is a single s_pack_ll_b32_b16 with no shift.
define void @s_shuffle_v2bf16_v8bf16__12_6() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s10, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s10, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__12_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s6, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 6>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Two-input shuffle, mask <13, 6>: result lane 0 is mask index 13, i.e. element
; 5 of %vec1 (indices 8-15 address the second operand); lane 1 is element 6 of
; %vec0. Both inputs are <8 x bfloat> values defined by inline asm with the
; "=s" constraint; the result is consumed by inline asm through the fixed
; register s8. The expected code is a right shift by 16 followed by
; s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__13_6() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s10, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s10, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__13_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s6, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 6>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Two-input shuffle, mask <14, 6>: result lane 0 is mask index 14, i.e. element
; 6 of %vec1 (indices 8-15 address the second operand); lane 1 is element 6 of
; %vec0. Both inputs are <8 x bfloat> values defined by inline asm with the
; "=s" constraint; the result is consumed by inline asm through the fixed
; register s8. The expected code is a single s_pack_ll_b32_b16 with no shift.
define void @s_shuffle_v2bf16_v8bf16__14_6() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_6:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s11, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_6:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s11, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__14_6:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s7, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 6>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Shuffle mask <poison, 7>: result lane 0 is unspecified and lane 1 takes
; element 7 of %vec0, an <8 x bfloat> defined by inline asm with the "=s"
; constraint (the second shuffle operand is poison). The result is consumed by
; inline asm through the fixed register s8. The expected code is a plain
; s_mov_b32 of the last register of the def into s8.
define void @s_shuffle_v2bf16_v8bf16__u_7() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s8, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s8, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__u_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s8, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 poison, i32 7>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Shuffle mask <0, 7>: result lane 0 takes element 0 and lane 1 takes element 7
; of %vec0, an <8 x bfloat> defined by inline asm with the "=s" constraint (the
; second shuffle operand is poison). The result is consumed by inline asm
; through the fixed register s8. The expected code is a right shift by 16
; followed by s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__0_7() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__0_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s3, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 0, i32 7>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Shuffle mask <1, 7>: result lane 0 takes element 1 and lane 1 takes element 7
; of %vec0, an <8 x bfloat> defined by inline asm with the "=s" constraint (the
; second shuffle operand is poison). The result is consumed by inline asm
; through the fixed register s8. The expected code is two right shifts by 16
; followed by s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__1_7() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s7, 16
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s7, 16
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__1_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s3, 16
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 1, i32 7>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Shuffle mask <2, 7>: result lane 0 takes element 2 and lane 1 takes element 7
; of %vec0, an <8 x bfloat> defined by inline asm with the "=s" constraint (the
; second shuffle operand is poison). The result is consumed by inline asm
; through the fixed register s8. The expected code is a right shift by 16
; followed by s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__2_7() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__2_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s3, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 2, i32 7>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Shuffle mask <3, 7>: result lane 0 takes element 3 and lane 1 takes element 7
; of %vec0, an <8 x bfloat> defined by inline asm with the "=s" constraint (the
; second shuffle operand is poison). The result is consumed by inline asm
; through the fixed register s8. The expected code is two right shifts by 16
; followed by s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__3_7() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s7, 16
; GFX900-NEXT:    s_lshr_b32 s5, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s7, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__3_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s3, 16
; GFX942-NEXT:    s_lshr_b32 s1, s1, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 3, i32 7>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Shuffle mask <4, 7>: result lane 0 takes element 4 and lane 1 takes element 7
; of %vec0, an <8 x bfloat> defined by inline asm with the "=s" constraint (the
; second shuffle operand is poison). The result is consumed by inline asm
; through the fixed register s8. The expected code is a right shift by 16
; followed by s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__4_7() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s6, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s6, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__4_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s3, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s2, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 4, i32 7>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Shuffle mask <5, 7>: result lane 0 takes element 5 and lane 1 takes element 7
; of %vec0, an <8 x bfloat> defined by inline asm with the "=s" constraint (the
; second shuffle operand is poison). The result is consumed by inline asm
; through the fixed register s8. The expected code is two right shifts by 16
; followed by s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__5_7() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s7, 16
; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s7, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__5_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s3, 16
; GFX942-NEXT:    s_lshr_b32 s1, s2, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 5, i32 7>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Shuffle mask <6, 7>: the result keeps elements 6 and 7 of %vec0 in their
; original order. %vec0 is an <8 x bfloat> defined by inline asm with the "=s"
; constraint (the second shuffle operand is poison); the result is consumed by
; inline asm through the fixed register s8. The expected code is a plain
; s_mov_b32 of the last register of the def into s8.
define void @s_shuffle_v2bf16_v8bf16__6_7() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s8, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s8, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__6_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s8, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 6, i32 7>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Shuffle mask <7, 7>: both result lanes take element 7 of %vec0, an
; <8 x bfloat> defined by inline asm with the "=s" constraint (the second
; shuffle operand is poison). The result is consumed by inline asm through the
; fixed register s8. The expected code is a right shift by 16 followed by
; s_pack_ll_b32_b16 with the shifted value as both sources.
define void @s_shuffle_v2bf16_v8bf16__7_7() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__7_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s3, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 7, i32 7>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Shuffle mask <8, 7>: mask index 8 addresses element 0 of the second shuffle
; operand, which is poison here, so only result lane 1 (element 7 of %vec0) is
; a defined value. %vec0 is an <8 x bfloat> defined by inline asm with the "=s"
; constraint; the result is consumed by inline asm through the fixed register
; s8. The expected code is a plain s_mov_b32 of the last register of the def
; into s8.
define void @s_shuffle_v2bf16_v8bf16__8_7() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s8, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s8, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__8_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s8, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 8, i32 7>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Two-input shuffle, mask <9, 7>: result lane 0 is mask index 9, i.e. element 1
; of %vec1 (indices 8-15 address the second operand); lane 1 is element 7 of
; %vec0. Both inputs are <8 x bfloat> values defined by inline asm with the
; "=s" constraint; the result is consumed by inline asm through the fixed
; register s8. The expected code is two right shifts by 16 followed by
; s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__9_7() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s7, 16
; GFX900-NEXT:    s_lshr_b32 s5, s8, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s7, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s8, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__9_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s3, 16
; GFX942-NEXT:    s_lshr_b32 s1, s4, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 7>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Two-input shuffle, mask <10, 7>: result lane 0 is mask index 10, i.e. element
; 2 of %vec1 (indices 8-15 address the second operand); lane 1 is element 7 of
; %vec0. Both inputs are <8 x bfloat> values defined by inline asm with the
; "=s" constraint; the result is consumed by inline asm through the fixed
; register s8. The expected code is a right shift by 16 followed by
; s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__10_7() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s9, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s9, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__10_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s3, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s5, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 7>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Two-input shuffle, mask <11, 7>: result lane 0 is mask index 11, i.e. element
; 3 of %vec1 (indices 8-15 address the second operand); lane 1 is element 7 of
; %vec0. Both inputs are <8 x bfloat> values defined by inline asm with the
; "=s" constraint; the result is consumed by inline asm through the fixed
; register s8. The expected code is two right shifts by 16 followed by
; s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__11_7() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s7, 16
; GFX900-NEXT:    s_lshr_b32 s5, s9, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s7, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s9, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__11_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s3, 16
; GFX942-NEXT:    s_lshr_b32 s1, s5, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 7>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar shuffle with mask <12, 7>: result lane 0 is element 4 of %vec1 and
; result lane 1 is element 7 of %vec0. The "{s8}" constraint on the use asm
; pins the result to s8. The expected code is one s_lshr_b32 by 16 plus one
; s_pack_ll_b32_b16; for gfx942 the shift is expected between the two asm defs.
define void @s_shuffle_v2bf16_v8bf16__12_7() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s10, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s10, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__12_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s3, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s6, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 7>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar shuffle with mask <13, 7>: result lane 0 is element 5 of %vec1 and
; result lane 1 is element 7 of %vec0. The "{s8}" constraint on the use asm
; pins the result to s8. The expected code is two s_lshr_b32 by 16 (one per
; source register) followed by a single s_pack_ll_b32_b16 into s8.
define void @s_shuffle_v2bf16_v8bf16__13_7() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s7, 16
; GFX900-NEXT:    s_lshr_b32 s5, s10, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s7, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s10, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__13_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s3, 16
; GFX942-NEXT:    s_lshr_b32 s1, s6, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 7>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar shuffle with mask <14, 7>: result lane 0 is element 6 of %vec1 and
; result lane 1 is element 7 of %vec0. The "{s8}" constraint on the use asm
; pins the result to s8. The expected code is one s_lshr_b32 by 16 plus one
; s_pack_ll_b32_b16; for gfx942 the shift is expected between the two asm defs.
define void @s_shuffle_v2bf16_v8bf16__14_7() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_7:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s11, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_7:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s11, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__14_7:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s3, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s7, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 7>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar shuffle with mask <poison, 8> where the second operand is poison.
; Index 8 selects lane 0 of that poison operand, so no result lane reads %vec0.
; The expected code therefore has no asm def and no shuffle instructions, only
; the "; use s8" block, and is shared by all three targets.
; NOTE(review): because the second operand is poison, this does not exercise
; element 8 of a real second vector - confirm that is intended by the generator.
define void @s_shuffle_v2bf16_v8bf16__u_8() {
; GFX9-LABEL: s_shuffle_v2bf16_v8bf16__u_8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use s8
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 poison, i32 8>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar shuffle with mask <0, 8> where the second operand is poison: result
; lane 0 is element 0 of %vec0 and lane 1 comes from the poison operand. The
; expected code places the asm def directly in s[8:11] so that s8 is consumed
; without any extra instruction; gfx942 additionally expects an s_nop 0 between
; the def and the use.
; NOTE(review): because the second operand is poison, this does not exercise
; element 8 of a real second vector - confirm that is intended by the generator.
define void @s_shuffle_v2bf16_v8bf16__0_8() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__0_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[8:11]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 0, i32 8>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar shuffle with mask <1, 8> where the second operand is poison: result
; lane 0 is element 1 of %vec0 and lane 1 comes from the poison operand. The
; expected code is a single s_lshr_b32 by 16 of the first register of the def,
; written straight into s8.
; NOTE(review): because the second operand is poison, this does not exercise
; element 8 of a real second vector - confirm that is intended by the generator.
define void @s_shuffle_v2bf16_v8bf16__1_8() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s8, s4, 16
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s8, s4, 16
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__1_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s8, s0, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 1, i32 8>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar shuffle with mask <2, 8> where the second operand is poison: result
; lane 0 is element 2 of %vec0 and lane 1 comes from the poison operand. The
; expected code is a single s_mov_b32 copying the second register of the def
; into s8.
; NOTE(review): because the second operand is poison, this does not exercise
; element 8 of a real second vector - confirm that is intended by the generator.
define void @s_shuffle_v2bf16_v8bf16__2_8() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s8, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s8, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__2_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s8, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 2, i32 8>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar shuffle with mask <3, 8> where the second operand is poison: result
; lane 0 is element 3 of %vec0 and lane 1 comes from the poison operand. The
; expected code is a single s_lshr_b32 by 16 of the second register of the def,
; written straight into s8.
; NOTE(review): because the second operand is poison, this does not exercise
; element 8 of a real second vector - confirm that is intended by the generator.
define void @s_shuffle_v2bf16_v8bf16__3_8() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s8, s5, 16
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s8, s5, 16
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__3_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s8, s1, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 3, i32 8>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar shuffle with mask <4, 8> where the second operand is poison: result
; lane 0 is element 4 of %vec0 and lane 1 comes from the poison operand. The
; expected code is a single s_mov_b32 copying the third register of the def
; into s8.
; NOTE(review): because the second operand is poison, this does not exercise
; element 8 of a real second vector - confirm that is intended by the generator.
define void @s_shuffle_v2bf16_v8bf16__4_8() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s8, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s8, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__4_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s8, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 4, i32 8>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar shuffle with mask <5, 8> where the second operand is poison: result
; lane 0 is element 5 of %vec0 and lane 1 comes from the poison operand. The
; expected code is a single s_lshr_b32 by 16 of the third register of the def,
; written straight into s8.
; NOTE(review): because the second operand is poison, this does not exercise
; element 8 of a real second vector - confirm that is intended by the generator.
define void @s_shuffle_v2bf16_v8bf16__5_8() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s8, s6, 16
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s8, s6, 16
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__5_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s8, s2, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 5, i32 8>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar shuffle with mask <6, 8> where the second operand is poison: result
; lane 0 is element 6 of %vec0 and lane 1 comes from the poison operand. The
; expected code is a single s_mov_b32 copying the fourth register of the def
; into s8.
; NOTE(review): because the second operand is poison, this does not exercise
; element 8 of a real second vector - confirm that is intended by the generator.
define void @s_shuffle_v2bf16_v8bf16__6_8() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s8, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s8, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__6_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s8, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 6, i32 8>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar shuffle with mask <7, 8> where the second operand is poison: result
; lane 0 is element 7 of %vec0 and lane 1 comes from the poison operand. The
; expected code is a single s_lshr_b32 by 16 of the fourth register of the def,
; written straight into s8.
; NOTE(review): because the second operand is poison, this does not exercise
; element 8 of a real second vector - confirm that is intended by the generator.
define void @s_shuffle_v2bf16_v8bf16__7_8() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s8, s7, 16
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s8, s7, 16
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__7_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s8, s3, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 7, i32 8>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar shuffle with mask <8, 8> where the second operand is poison. Both
; result lanes select lane 0 of that poison operand, so %vec0 is never read.
; The expected code therefore has no asm def and no shuffle instructions, only
; the "; use s8" block, and is shared by all three targets.
; NOTE(review): because the second operand is poison, this does not exercise
; element 8 of a real second vector - confirm that is intended by the generator.
define void @s_shuffle_v2bf16_v8bf16__8_8() {
; GFX9-LABEL: s_shuffle_v2bf16_v8bf16__8_8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use s8
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> poison, <2 x i32> <i32 8, i32 8>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar shuffle with mask <9, 8>: result lane 0 is element 1 of %vec1 and
; result lane 1 is element 0 of %vec1, i.e. the first two elements swapped.
; %vec0 is not referenced by the mask, and the expected code contains only one
; asm def. The swap is expected as s_lshr_b32 by 16 followed by
; s_pack_ll_b32_b16 into s8.
define void @s_shuffle_v2bf16_v8bf16__9_8() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__9_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s0, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 8>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar shuffle with mask <10, 8>: result lane 0 is element 2 of %vec1 and
; result lane 1 is element 0 of %vec1. %vec0 is not referenced by the mask, and
; the expected code contains only one asm def. The result is expected from a
; single s_pack_ll_b32_b16 of the second and first registers of that def.
define void @s_shuffle_v2bf16_v8bf16__10_8() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__10_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 8>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar shuffle with mask <11, 8>: result lane 0 is element 3 of %vec1 and
; result lane 1 is element 0 of %vec1. %vec0 is not referenced by the mask, and
; the expected code contains only one asm def. The result is expected from an
; s_lshr_b32 by 16 of the second register followed by s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__11_8() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__11_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s1, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 8>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar shuffle with mask <12, 8>: result lane 0 is element 4 of %vec1 and
; result lane 1 is element 0 of %vec1. %vec0 is not referenced by the mask, and
; the expected code contains only one asm def. The result is expected from a
; single s_pack_ll_b32_b16 of the third and first registers of that def.
define void @s_shuffle_v2bf16_v8bf16__12_8() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s6, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s6, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__12_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s2, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 8>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar shuffle with mask <13, 8>: result lane 0 is element 5 of %vec1 and
; result lane 1 is element 0 of %vec1. %vec0 is not referenced by the mask, and
; the expected code contains only one asm def. The result is expected from an
; s_lshr_b32 by 16 of the third register followed by s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__13_8() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__13_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s2, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 8>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar shuffle with mask <14, 8>: result lane 0 is element 6 of %vec1 and
; result lane 1 is element 0 of %vec1. %vec0 is not referenced by the mask, and
; the expected code contains only one asm def. The result is expected from a
; single s_pack_ll_b32_b16 of the fourth and first registers of that def.
define void @s_shuffle_v2bf16_v8bf16__14_8() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_8:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_8:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__14_8:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s3, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 8>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar shuffle with mask <poison, 9>: result lane 0 is unconstrained and
; result lane 1 is element 1 of %vec1. %vec0 is not referenced by the mask, and
; the expected code contains only one asm def. That def is expected directly in
; s[8:11] so s8 is consumed without any extra instruction; gfx942 additionally
; expects an s_nop 0 between the def and the use.
define void @s_shuffle_v2bf16_v8bf16__u_9() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__u_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[8:11]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 poison, i32 9>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar shuffle with mask <0, 9>: result lane 0 is element 0 of %vec0 and
; result lane 1 is element 1 of %vec1. The "{s8}" constraint on the use asm
; pins the result to s8. The expected code is one s_lshr_b32 by 16 of the first
; register of the second def, followed by s_pack_ll_b32_b16 into s8.
define void @s_shuffle_v2bf16_v8bf16__0_9() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s8, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s8, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__0_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s4, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 0, i32 9>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <1, 9>: lane 0 is element 1 of %vec0 and lane 1 is
; element 1 of %vec1 (mask indices 8..15 address the second operand).
; The expected code shifts a dword from each source right by 16 and then
; packs them with s_pack_ll_b32_b16 into s8.
define void @s_shuffle_v2bf16_v8bf16__1_9() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s8, 16
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s8, 16
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__1_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s4, 16
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 1, i32 9>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <2, 9>: lane 0 is element 2 of %vec0 and lane 1 is
; element 1 of %vec1 (mask indices 8..15 address the second operand).
; The expected code needs one s_lshr_b32 by 16 followed by s_pack_ll_b32_b16,
; with the result pinned to s8.
define void @s_shuffle_v2bf16_v8bf16__2_9() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s8, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s8, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__2_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s4, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 2, i32 9>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <3, 9>: lane 0 is element 3 of %vec0 and lane 1 is
; element 1 of %vec1 (mask indices 8..15 address the second operand).
; The expected code shifts a dword from each source right by 16 and then
; packs them with s_pack_ll_b32_b16 into s8.
define void @s_shuffle_v2bf16_v8bf16__3_9() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s8, 16
; GFX900-NEXT:    s_lshr_b32 s5, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s8, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__3_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s4, 16
; GFX942-NEXT:    s_lshr_b32 s1, s1, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 3, i32 9>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <4, 9>: lane 0 is element 4 of %vec0 and lane 1 is
; element 1 of %vec1 (mask indices 8..15 address the second operand).
; The expected code needs one s_lshr_b32 by 16 followed by s_pack_ll_b32_b16,
; with the result pinned to s8.
define void @s_shuffle_v2bf16_v8bf16__4_9() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s8, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s6, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s8, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s6, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__4_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s4, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s2, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 4, i32 9>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <5, 9>: lane 0 is element 5 of %vec0 and lane 1 is
; element 1 of %vec1 (mask indices 8..15 address the second operand).
; The expected code shifts a dword from each source right by 16 and then
; packs them with s_pack_ll_b32_b16 into s8.
define void @s_shuffle_v2bf16_v8bf16__5_9() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s8, 16
; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s8, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__5_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s4, 16
; GFX942-NEXT:    s_lshr_b32 s1, s2, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 5, i32 9>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <6, 9>: lane 0 is element 6 of %vec0 and lane 1 is
; element 1 of %vec1 (mask indices 8..15 address the second operand).
; The expected code needs one s_lshr_b32 by 16 followed by s_pack_ll_b32_b16,
; with the result pinned to s8.
define void @s_shuffle_v2bf16_v8bf16__6_9() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s8, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s8, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__6_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s4, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s3, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 6, i32 9>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <7, 9>: lane 0 is element 7 of %vec0 and lane 1 is
; element 1 of %vec1 (mask indices 8..15 address the second operand).
; The expected code shifts a dword from each source right by 16 and then
; packs them with s_pack_ll_b32_b16 into s8.
define void @s_shuffle_v2bf16_v8bf16__7_9() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s8, 16
; GFX900-NEXT:    s_lshr_b32 s5, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s8, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__7_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s4, 16
; GFX942-NEXT:    s_lshr_b32 s1, s3, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 7, i32 9>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <8, 9>: both lanes come from %vec1 (elements 0 and 1, in
; order), so %vec0 is unused. The expected output contains a single asm def,
; placed in s[8:11], whose first register feeds "use s8" directly with no
; shift or pack. The gfx942 checks additionally expect an s_nop 0 between the
; def and the use.
define void @s_shuffle_v2bf16_v8bf16__8_9() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__8_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[8:11]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 8, i32 9>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <9, 9>: both lanes are element 1 of %vec1, so %vec0 is
; unused and only one asm def appears in the expected output. The expected
; code shifts the source dword right by 16 and packs that register with
; itself via s_pack_ll_b32_b16 into s8.
define void @s_shuffle_v2bf16_v8bf16__9_9() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__9_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 9>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <10, 9>: lane 0 is element 2 of %vec1 and lane 1 is
; element 1 of %vec1, so %vec0 is unused and only one asm def appears in the
; expected output. The expected code is one s_lshr_b32 by 16 followed by
; s_pack_ll_b32_b16 into s8.
define void @s_shuffle_v2bf16_v8bf16__10_9() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__10_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 9>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <11, 9>: lane 0 is element 3 of %vec1 and lane 1 is
; element 1 of %vec1, so %vec0 is unused and only one asm def appears in the
; expected output. The expected code shifts two dwords of that def right by
; 16 and packs them with s_pack_ll_b32_b16 into s8.
define void @s_shuffle_v2bf16_v8bf16__11_9() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_lshr_b32 s5, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__11_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_lshr_b32 s1, s1, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 9>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <12, 9>: lane 0 is element 4 of %vec1 and lane 1 is
; element 1 of %vec1, so %vec0 is unused and only one asm def appears in the
; expected output. The expected code is one s_lshr_b32 by 16 followed by
; s_pack_ll_b32_b16 into s8.
define void @s_shuffle_v2bf16_v8bf16__12_9() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s6, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s6, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__12_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s2, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 9>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <13, 9>: lane 0 is element 5 of %vec1 and lane 1 is
; element 1 of %vec1, so %vec0 is unused and only one asm def appears in the
; expected output. The expected code shifts two dwords of that def right by
; 16 and packs them with s_pack_ll_b32_b16 into s8.
define void @s_shuffle_v2bf16_v8bf16__13_9() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__13_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_lshr_b32 s1, s2, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 9>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <14, 9>: lane 0 is element 6 of %vec1 and lane 1 is
; element 1 of %vec1, so %vec0 is unused and only one asm def appears in the
; expected output. The expected code is one s_lshr_b32 by 16 followed by
; s_pack_ll_b32_b16 into s8.
define void @s_shuffle_v2bf16_v8bf16__14_9() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_9:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_9:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__14_9:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s3, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 9>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <poison, 10>: lane 0 is unconstrained and lane 1 is
; element 2 of %vec1 (mask indices 8..15 address the second operand), so
; %vec0 is unused and only one asm def appears in the expected output.
; The expected code is a single s_lshl_b32 by 16 writing s8.
define void @s_shuffle_v2bf16_v8bf16__u_10() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshl_b32 s8, s5, 16
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshl_b32 s8, s5, 16
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__u_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshl_b32 s8, s1, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 poison, i32 10>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <0, 10>: lane 0 is element 0 of %vec0 and lane 1 is
; element 2 of %vec1 (mask indices 8..15 address the second operand).
; The expected code is a single s_pack_ll_b32_b16 into s8 with no shift.
define void @s_shuffle_v2bf16_v8bf16__0_10() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s9
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s9
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__0_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s5
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 0, i32 10>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <1, 10>: lane 0 is element 1 of %vec0 and lane 1 is
; element 2 of %vec1 (mask indices 8..15 address the second operand).
; The expected code is one s_lshr_b32 by 16 plus s_pack_ll_b32_b16 into s8;
; in the gfx942 checks the shift is expected between the two asm defs.
define void @s_shuffle_v2bf16_v8bf16__1_10() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s9
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s9
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__1_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s5
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 1, i32 10>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <2, 10>: lane 0 is element 2 of %vec0 and lane 1 is
; element 2 of %vec1 (mask indices 8..15 address the second operand).
; The expected code is a single s_pack_ll_b32_b16 into s8 with no shift.
define void @s_shuffle_v2bf16_v8bf16__2_10() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s9
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s9
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__2_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s5
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 2, i32 10>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle ("=s" inputs, result pinned to s8 by "{s8}").
; Mask <3, 10>: lane 0 = %vec0[3], lane 1 = %vec1[2] (mask indices 8-15
; address the second operand). Element 3 is odd-numbered; the expected code
; shifts its dword right by 16 before the s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__3_10() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s9
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s9
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__3_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s1, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s5
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 3, i32 10>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle ("=s" inputs, result pinned to s8 by "{s8}").
; Mask <4, 10>: lane 0 = %vec0[4], lane 1 = %vec1[2] (mask indices 8-15
; address the second operand). Both elements are even-numbered, and the
; expected code is a single s_pack_ll_b32_b16 of the two source dwords.
define void @s_shuffle_v2bf16_v8bf16__4_10() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s6, s9
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s6, s9
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__4_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s2, s5
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 4, i32 10>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle ("=s" inputs, result pinned to s8 by "{s8}").
; Mask <5, 10>: lane 0 = %vec0[5], lane 1 = %vec1[2] (mask indices 8-15
; address the second operand). Element 5 is odd-numbered; the expected code
; shifts its dword right by 16 before the s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__5_10() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s9
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s9
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__5_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s2, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s5
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 5, i32 10>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle ("=s" inputs, result pinned to s8 by "{s8}").
; Mask <6, 10>: lane 0 = %vec0[6], lane 1 = %vec1[2] (mask indices 8-15
; address the second operand). Both elements are even-numbered, and the
; expected code is a single s_pack_ll_b32_b16 of the two source dwords.
define void @s_shuffle_v2bf16_v8bf16__6_10() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s9
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s9
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__6_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s3, s5
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 6, i32 10>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle ("=s" inputs, result pinned to s8 by "{s8}").
; Mask <7, 10>: lane 0 = %vec0[7], lane 1 = %vec1[2] (mask indices 8-15
; address the second operand). Element 7 is odd-numbered; the expected code
; shifts its dword right by 16 before the s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__7_10() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s9
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s9
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__7_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s3, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s5
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 7, i32 10>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle ("=s" inputs, result pinned to s8 by "{s8}").
; Mask <8, 10>: both lanes come from the second operand (lane 0 = %vec1[0],
; lane 1 = %vec1[2]). The expected output contains only one "def" block;
; presumably the unused, non-sideeffect asm producing %vec0 is dropped.
define void @s_shuffle_v2bf16_v8bf16__8_10() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__8_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 8, i32 10>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle ("=s" inputs, result pinned to s8 by "{s8}").
; Mask <9, 10>: both lanes come from the second operand (lane 0 = %vec1[1],
; lane 1 = %vec1[2]). Element 1 is odd-numbered, so the expected code shifts
; the first dword right by 16 before packing. Only one "def" block is
; expected; presumably the unused asm producing %vec0 is dropped.
define void @s_shuffle_v2bf16_v8bf16__9_10() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__9_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 10>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle ("=s" inputs, result pinned to s8 by "{s8}").
; Mask <10, 10>: both lanes are %vec1[2] (a splat of one element of the
; second operand). The expected code packs the same source dword with itself.
; Only one "def" block is expected; presumably the unused asm producing
; %vec0 is dropped.
define void @s_shuffle_v2bf16_v8bf16__10_10() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__10_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 10>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle ("=s" inputs, result pinned to s8 by "{s8}").
; Mask <11, 10>: lane 0 = %vec1[3], lane 1 = %vec1[2], i.e. the two halves of
; one dword of the second operand in swapped order. The expected code shifts
; that dword right by 16 and packs the result with the unshifted dword.
; Only one "def" block is expected; presumably the unused asm producing
; %vec0 is dropped.
define void @s_shuffle_v2bf16_v8bf16__11_10() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__11_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s1, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 10>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle ("=s" inputs, result pinned to s8 by "{s8}").
; Mask <12, 10>: both lanes come from the second operand (lane 0 = %vec1[4],
; lane 1 = %vec1[2]). Both elements are even-numbered, and the expected code
; is a single s_pack_ll_b32_b16. Only one "def" block is expected; presumably
; the unused asm producing %vec0 is dropped.
define void @s_shuffle_v2bf16_v8bf16__12_10() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s6, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s6, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__12_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s2, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 10>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle ("=s" inputs, result pinned to s8 by "{s8}").
; Mask <13, 10>: both lanes come from the second operand (lane 0 = %vec1[5],
; lane 1 = %vec1[2]). Element 5 is odd-numbered, so the expected code shifts
; its dword right by 16 before packing. Only one "def" block is expected;
; presumably the unused asm producing %vec0 is dropped.
define void @s_shuffle_v2bf16_v8bf16__13_10() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__13_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s2, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 10>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle ("=s" inputs, result pinned to s8 by "{s8}").
; Mask <14, 10>: both lanes come from the second operand (lane 0 = %vec1[6],
; lane 1 = %vec1[2]). Both elements are even-numbered, and the expected code
; is a single s_pack_ll_b32_b16. Only one "def" block is expected; presumably
; the unused asm producing %vec0 is dropped.
define void @s_shuffle_v2bf16_v8bf16__14_10() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_10:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_10:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__14_10:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s3, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 10>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle ("=s" inputs, result pinned to s8 by "{s8}").
; Mask <poison, 11>: lane 0 is unspecified, lane 1 = %vec1[3] (mask indices
; 8-15 address the second operand). The expected code needs no pack or shift:
; it copies the source dword into s8 with a single s_mov_b32. Only one "def"
; block is expected; presumably the unused asm producing %vec0 is dropped.
define void @s_shuffle_v2bf16_v8bf16__u_11() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s8, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s8, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__u_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s8, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 poison, i32 11>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle ("=s" inputs, result pinned to s8 by "{s8}").
; Mask <0, 11>: lane 0 = %vec0[0], lane 1 = %vec1[3] (mask indices 8-15
; address the second operand). Element 3 is odd-numbered; the expected code
; shifts its dword right by 16 before the s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__0_11() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s9, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s9, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__0_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s5, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 0, i32 11>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle ("=s" inputs, result pinned to s8 by "{s8}").
; Mask <1, 11>: lane 0 = %vec0[1], lane 1 = %vec1[3] (mask indices 8-15
; address the second operand). Both elements are odd-numbered, so the
; expected code shifts both source dwords right by 16 before the
; s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__1_11() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s9, 16
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s9, 16
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__1_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s5, 16
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 1, i32 11>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle ("=s" inputs, result pinned to s8 by "{s8}").
; Mask <2, 11>: lane 0 = %vec0[2], lane 1 = %vec1[3] (mask indices 8-15
; address the second operand). Only element 3 is odd-numbered; the expected
; code shifts its dword right by 16 before the s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__2_11() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s9, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s9, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__2_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s5, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 2, i32 11>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle ("=s" inputs, result pinned to s8 by "{s8}").
; Mask <3, 11>: lane 0 = %vec0[3], lane 1 = %vec1[3] (mask indices 8-15
; address the second operand). Both elements are odd-numbered, so the
; expected code shifts both source dwords right by 16 before the
; s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__3_11() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s9, 16
; GFX900-NEXT:    s_lshr_b32 s5, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s9, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__3_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s5, 16
; GFX942-NEXT:    s_lshr_b32 s1, s1, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 3, i32 11>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle ("=s" inputs, result pinned to s8 by "{s8}").
; Mask <4, 11>: lane 0 = %vec0[4], lane 1 = %vec1[3] (mask indices 8-15
; address the second operand). Only element 3 is odd-numbered; the expected
; code shifts its dword right by 16 before the s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__4_11() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s9, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s6, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s9, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s6, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__4_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s5, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s2, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 4, i32 11>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; Scalar-register shuffle ("=s" inputs, result pinned to s8 by "{s8}").
; Mask <5, 11>: lane 0 = %vec0[5], lane 1 = %vec1[3] (mask indices 8-15
; address the second operand). Both elements are odd-numbered, so the
; expected code shifts both source dwords right by 16 before the
; s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__5_11() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s9, 16
; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s9, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__5_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s5, 16
; GFX942-NEXT:    s_lshr_b32 s1, s2, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 5, i32 11>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <6, 11>: result lane 0 is lane 6 of %vec0, result lane 1
; is lane 3 of %vec1 (mask index 11 = 8 + 3). The packed result is handed to
; inline asm pinned to s8. The expected output needs one right shift by 16
; (for the second operand of the pack) followed by s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__6_11() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s9, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s9, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__6_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s5, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s3, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 6, i32 11>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <7, 11>: result lane 0 is lane 7 of %vec0, result lane 1
; is lane 3 of %vec1 (mask index 11 = 8 + 3). The packed result is handed to
; inline asm pinned to s8. The expected output shifts both source registers
; right by 16 before packing them with s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__7_11() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s9, 16
; GFX900-NEXT:    s_lshr_b32 s5, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s9, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__7_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s5, 16
; GFX942-NEXT:    s_lshr_b32 s1, s3, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 7, i32 11>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <8, 11>: both mask indices are >= 8, so the result is
; lanes 0 and 3 of %vec1 and %vec0 is never read; the expected output contains
; a single def. One right shift by 16 feeds the second operand of
; s_pack_ll_b32_b16, and the result is handed to inline asm pinned to s8.
define void @s_shuffle_v2bf16_v8bf16__8_11() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__8_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s1, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 8, i32 11>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <9, 11>: both mask indices are >= 8, so the result is
; lanes 1 and 3 of %vec1 and %vec0 is never read; the expected output contains
; a single def. Both source registers are shifted right by 16 before
; s_pack_ll_b32_b16, and the result is handed to inline asm pinned to s8.
define void @s_shuffle_v2bf16_v8bf16__9_11() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s5, 16
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s5, 16
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__9_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s1, 16
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 11>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <10, 11>: both mask indices are >= 8, so the result is
; lanes 2 and 3 of %vec1 and %vec0 is never read; the expected output contains
; a single def. No shift or pack is expected: the result is produced by one
; s_mov_b32 of the def's second register into s8, which the inline asm uses.
define void @s_shuffle_v2bf16_v8bf16__10_11() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s8, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s8, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__10_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s8, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 11>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <11, 11>: lane 3 of %vec1 (mask index 11 = 8 + 3) is
; duplicated into both result lanes; %vec0 is never read, so the expected
; output contains a single def. One right shift by 16 is expected, then
; s_pack_ll_b32_b16 with the shifted register as both operands, into s8.
define void @s_shuffle_v2bf16_v8bf16__11_11() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__11_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s1, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 11>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <12, 11>: both mask indices are >= 8, so the result is
; lanes 4 and 3 of %vec1 and %vec0 is never read; the expected output contains
; a single def. One right shift by 16 feeds the second operand of
; s_pack_ll_b32_b16, and the result is handed to inline asm pinned to s8.
define void @s_shuffle_v2bf16_v8bf16__12_11() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s6, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s6, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__12_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s1, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s2, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 11>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <13, 11>: both mask indices are >= 8, so the result is
; lanes 5 and 3 of %vec1 and %vec0 is never read; the expected output contains
; a single def. Both source registers are shifted right by 16 before
; s_pack_ll_b32_b16, and the result is handed to inline asm pinned to s8.
define void @s_shuffle_v2bf16_v8bf16__13_11() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__13_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s1, 16
; GFX942-NEXT:    s_lshr_b32 s1, s2, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 11>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <14, 11>: both mask indices are >= 8, so the result is
; lanes 6 and 3 of %vec1 and %vec0 is never read; the expected output contains
; a single def. One right shift by 16 feeds the second operand of
; s_pack_ll_b32_b16, and the result is handed to inline asm pinned to s8.
define void @s_shuffle_v2bf16_v8bf16__14_11() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_11:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_11:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__14_11:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s1, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s3, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 11>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <poison, 12>: result lane 0 is unspecified and result
; lane 1 is lane 4 of %vec1 (mask index 12 = 8 + 4); %vec0 is never read, so
; the expected output contains a single def. No pack is expected: a single
; s_lshl_b32 by 16 writes s8, which the inline asm then uses.
define void @s_shuffle_v2bf16_v8bf16__u_12() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshl_b32 s8, s6, 16
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshl_b32 s8, s6, 16
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__u_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshl_b32 s8, s2, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 poison, i32 12>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <0, 12>: result lane 0 is lane 0 of %vec0, result lane 1
; is lane 4 of %vec1 (mask index 12 = 8 + 4). The packed result is handed to
; inline asm pinned to s8. The expected output needs no shifts: a single
; s_pack_ll_b32_b16 combines one register from each def.
define void @s_shuffle_v2bf16_v8bf16__0_12() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s10
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s10
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__0_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s6
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 0, i32 12>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <1, 12>: result lane 0 is lane 1 of %vec0, result lane 1
; is lane 4 of %vec1 (mask index 12 = 8 + 4). The packed result is handed to
; inline asm pinned to s8. The expected output needs one right shift by 16
; (for the first operand of the pack) followed by s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__1_12() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s10
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s10
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__1_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s6
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 1, i32 12>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <2, 12>: result lane 0 is lane 2 of %vec0, result lane 1
; is lane 4 of %vec1 (mask index 12 = 8 + 4). The packed result is handed to
; inline asm pinned to s8. The expected output needs no shifts: a single
; s_pack_ll_b32_b16 combines one register from each def.
define void @s_shuffle_v2bf16_v8bf16__2_12() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s10
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s10
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__2_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s6
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 2, i32 12>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <3, 12>: result lane 0 is lane 3 of %vec0, result lane 1
; is lane 4 of %vec1 (mask index 12 = 8 + 4). The packed result is handed to
; inline asm pinned to s8. The expected output needs one right shift by 16
; (for the first operand of the pack) followed by s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__3_12() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s10
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s10
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__3_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s1, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s6
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 3, i32 12>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <4, 12>: result lane 0 is lane 4 of %vec0, result lane 1
; is lane 4 of %vec1 (mask index 12 = 8 + 4). The packed result is handed to
; inline asm pinned to s8. The expected output needs no shifts: a single
; s_pack_ll_b32_b16 combines one register from each def.
define void @s_shuffle_v2bf16_v8bf16__4_12() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s6, s10
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s6, s10
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__4_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s2, s6
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 4, i32 12>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <5, 12>: result lane 0 is lane 5 of %vec0, result lane 1
; is lane 4 of %vec1 (mask index 12 = 8 + 4). The packed result is handed to
; inline asm pinned to s8. The expected output needs one right shift by 16
; (for the first operand of the pack) followed by s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__5_12() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s10
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s10
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__5_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s2, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s6
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 5, i32 12>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <6, 12>: result lane 0 is lane 6 of %vec0, result lane 1
; is lane 4 of %vec1 (mask index 12 = 8 + 4). The packed result is handed to
; inline asm pinned to s8. The expected output needs no shifts: a single
; s_pack_ll_b32_b16 combines one register from each def.
define void @s_shuffle_v2bf16_v8bf16__6_12() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s10
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s10
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__6_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s3, s6
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 6, i32 12>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle, mask <7, 12>: result lane 0 is lane 7 of %vec0, result lane 1
; is lane 4 of %vec1 (mask index 12 = 8 + 4). The packed result is handed to
; inline asm pinned to s8. The expected output needs one right shift by 16
; (for the first operand of the pack) followed by s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__7_12() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s10
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s10
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__7_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s3, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s6
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 7, i32 12>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <8, 12>, so lane 0 = %vec1[0] and lane 1 = %vec1[4].
; Both lanes come from %vec1, and the expected output contains a single def
; block. Both elements are even, so the two dwords feed s_pack_ll_b32_b16
; directly with no shift.
define void @s_shuffle_v2bf16_v8bf16__8_12() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__8_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 8, i32 12>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <9, 12>, so lane 0 = %vec1[1] and lane 1 = %vec1[4].
; Both lanes come from %vec1, and the expected output contains a single def
; block. The odd element 1 is shifted right by 16 before the pack.
define void @s_shuffle_v2bf16_v8bf16__9_12() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__9_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 12>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <10, 12>, so lane 0 = %vec1[2] and lane 1 = %vec1[4].
; Both lanes come from %vec1, and the expected output contains a single def
; block. Both elements are even, so the pack needs no shift.
define void @s_shuffle_v2bf16_v8bf16__10_12() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__10_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 12>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <11, 12>, so lane 0 = %vec1[3] and lane 1 = %vec1[4].
; Both lanes come from %vec1, and the expected output contains a single def
; block. The odd element 3 is shifted right by 16 before the pack.
define void @s_shuffle_v2bf16_v8bf16__11_12() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__11_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s1, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 12>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <12, 12>, which splats %vec1[4] into both lanes.
; The expected output contains a single def block and packs the same dword
; with itself via s_pack_ll_b32_b16.
define void @s_shuffle_v2bf16_v8bf16__12_12() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s6, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s6, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__12_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s2, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 12>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <13, 12>, so lane 0 = %vec1[5] and lane 1 = %vec1[4].
; This swaps the two halves of one dword of %vec1. The expected output
; contains a single def block, shifts that dword right by 16, and packs the
; shifted value with the unshifted dword.
define void @s_shuffle_v2bf16_v8bf16__13_12() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__13_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s2, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 12>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <14, 12>, so lane 0 = %vec1[6] and lane 1 = %vec1[4].
; Both lanes come from %vec1, and the expected output contains a single def
; block. Both elements are even, so the pack needs no shift.
define void @s_shuffle_v2bf16_v8bf16__14_12() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_12:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_12:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__14_12:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s3, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 12>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <poison, 13>, so lane 0 is poison and lane 1 is
; %vec1[5]. Because lane 0 is unconstrained, the expected output is a plain
; s_mov_b32 of the dword holding %vec1[5] into s8, with no shift or pack.
; The expected output contains a single def block.
define void @s_shuffle_v2bf16_v8bf16__u_13() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s8, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s8, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__u_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s8, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 poison, i32 13>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <0, 13>, so lane 0 = %vec0[0] and lane 1 = %vec1[5].
; Both inputs are live, so two def blocks are expected. Only the odd element
; %vec1[5] is shifted right by 16 before s_pack_ll_b32_b16 writes s8.
define void @s_shuffle_v2bf16_v8bf16__0_13() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s10, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s10, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__0_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s6, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 0, i32 13>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <1, 13>, so lane 0 = %vec0[1] and lane 1 = %vec1[5].
; Both inputs are live, so two def blocks are expected. Both source elements
; are odd, so each dword is shifted right by 16 before the pack into s8.
define void @s_shuffle_v2bf16_v8bf16__1_13() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s10, 16
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s10, 16
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__1_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s6, 16
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 1, i32 13>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <2, 13>, so lane 0 = %vec0[2] and lane 1 = %vec1[5].
; Both inputs are live, so two def blocks are expected. Only the odd element
; %vec1[5] is shifted right by 16 before s_pack_ll_b32_b16 writes s8.
define void @s_shuffle_v2bf16_v8bf16__2_13() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s10, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s10, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__2_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s6, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 2, i32 13>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <3, 13>, so lane 0 = %vec0[3] and lane 1 = %vec1[5].
; Both inputs are live, so two def blocks are expected. Both source elements
; are odd, so each dword is shifted right by 16 before the pack into s8.
define void @s_shuffle_v2bf16_v8bf16__3_13() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s10, 16
; GFX900-NEXT:    s_lshr_b32 s5, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s10, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__3_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s6, 16
; GFX942-NEXT:    s_lshr_b32 s1, s1, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 3, i32 13>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <4, 13>, so lane 0 = %vec0[4] and lane 1 = %vec1[5].
; Both inputs are live, so two def blocks are expected. Only the odd element
; %vec1[5] is shifted right by 16 before s_pack_ll_b32_b16 writes s8.
define void @s_shuffle_v2bf16_v8bf16__4_13() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s10, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s6, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s10, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s6, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__4_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s6, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s2, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 4, i32 13>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <5, 13>, so lane 0 = %vec0[5] and lane 1 = %vec1[5].
; Both inputs are live, so two def blocks are expected. Both source elements
; are odd, so each dword is shifted right by 16 before the pack into s8.
define void @s_shuffle_v2bf16_v8bf16__5_13() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s10, 16
; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s10, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__5_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s6, 16
; GFX942-NEXT:    s_lshr_b32 s1, s2, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 5, i32 13>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <6, 13>, so lane 0 = %vec0[6] and lane 1 = %vec1[5].
; Both inputs are live, so two def blocks are expected. Only the odd element
; %vec1[5] is shifted right by 16 before s_pack_ll_b32_b16 writes s8.
define void @s_shuffle_v2bf16_v8bf16__6_13() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s10, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s10, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__6_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s6, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s3, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 6, i32 13>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <7, 13>, so lane 0 = %vec0[7] and lane 1 = %vec1[5].
; Both inputs are live, so two def blocks are expected. Both source elements
; are odd, so each dword is shifted right by 16 before the pack into s8.
define void @s_shuffle_v2bf16_v8bf16__7_13() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s10, 16
; GFX900-NEXT:    s_lshr_b32 s5, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s10, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__7_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s6, 16
; GFX942-NEXT:    s_lshr_b32 s1, s3, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 7, i32 13>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <8, 13>, so lane 0 = %vec1[0] and lane 1 = %vec1[5].
; Both lanes come from %vec1, and the expected output contains a single def
; block. Only the odd element 5 is shifted right by 16 before the pack.
define void @s_shuffle_v2bf16_v8bf16__8_13() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__8_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s2, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 8, i32 13>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <9, 13>, so lane 0 = %vec1[1] and lane 1 = %vec1[5].
; Both lanes come from %vec1, and the expected output contains a single def
; block. Both elements are odd, so each dword is shifted right by 16 before
; the pack into s8.
define void @s_shuffle_v2bf16_v8bf16__9_13() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__9_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s2, 16
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 13>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <10, 13>, so lane 0 = %vec1[2] and lane 1 = %vec1[5].
; Both lanes come from %vec1, and the expected output contains a single def
; block. Only the odd element 5 is shifted right by 16 before the pack.
define void @s_shuffle_v2bf16_v8bf16__10_13() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__10_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s2, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 13>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR variant, mask <11, 13>: the result is { %vec1[3], %vec1[5] }; %vec0 does
; not contribute. Both selected elements are odd-indexed, and the expected
; output shifts each source dword right by 16 before packing into s8.
define void @s_shuffle_v2bf16_v8bf16__11_13() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
; GFX900-NEXT:    s_lshr_b32 s5, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__11_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s2, 16
; GFX942-NEXT:    s_lshr_b32 s1, s1, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 13>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR variant, mask <12, 13>: the result is { %vec1[4], %vec1[5] }, an adjacent
; even/odd pair in source order. The expected output is a single 32-bit SGPR
; copy into s8 with no shift or pack.
define void @s_shuffle_v2bf16_v8bf16__12_13() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s8, s6
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s8, s6
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__12_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s8, s2
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 13>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR variant, mask <13, 13>: both result lanes are %vec1[5] (a splat of one
; odd-indexed element). The expected output shifts the source dword right by 16
; once and packs that value with itself into s8.
define void @s_shuffle_v2bf16_v8bf16__13_13() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__13_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s2, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 13>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR variant, mask <14, 13>: the result is { %vec1[6], %vec1[5] }; %vec0 does
; not contribute. Lane 0 is an even-indexed element (used directly as the low
; pack operand) and lane 1 is odd-indexed (shifted right by 16 first).
define void @s_shuffle_v2bf16_v8bf16__14_13() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_13:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_13:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__14_13:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s2, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s3, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 13>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR variant, mask <poison, 14>: lane 0 is unspecified and lane 1 is
; %vec1[6]. The expected output is a single left shift by 16 that places the
; selected element in the high half of s8; no pack instruction is expected.
define void @s_shuffle_v2bf16_v8bf16__u_14() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshl_b32 s8, s7, 16
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshl_b32 s8, s7, 16
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__u_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshl_b32 s8, s3, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 poison, i32 14>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR variant, mask <0, 14>: the result is { %vec0[0], %vec1[6] }, so both
; inputs are live and two def blocks are expected. Both selected elements are
; even-indexed; the expected output packs the two source dwords directly with
; no shift.
define void @s_shuffle_v2bf16_v8bf16__0_14() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s11
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s11
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__0_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s7
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 0, i32 14>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR variant, mask <1, 14>: the result is { %vec0[1], %vec1[6] }, so both
; inputs are live and two def blocks are expected. Lane 0 is an odd-indexed
; element (shifted right by 16 before the pack); lane 1 is even-indexed.
define void @s_shuffle_v2bf16_v8bf16__1_14() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s11
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s11
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__1_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s7
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 1, i32 14>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR variant, mask <2, 14>: the result is { %vec0[2], %vec1[6] }, so both
; inputs are live and two def blocks are expected. Both selected elements are
; even-indexed; the expected output packs the source dwords with no shift.
define void @s_shuffle_v2bf16_v8bf16__2_14() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s11
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s11
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__2_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s7
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 2, i32 14>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR variant, mask <3, 14>: the result is { %vec0[3], %vec1[6] }, so both
; inputs are live and two def blocks are expected. Lane 0 is an odd-indexed
; element (shifted right by 16 before the pack); lane 1 is even-indexed.
define void @s_shuffle_v2bf16_v8bf16__3_14() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s11
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s11
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__3_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s1, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s7
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 3, i32 14>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR variant, mask <4, 14>: the result is { %vec0[4], %vec1[6] }, so both
; inputs are live and two def blocks are expected. Both selected elements are
; even-indexed; the expected output packs the source dwords with no shift.
define void @s_shuffle_v2bf16_v8bf16__4_14() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s6, s11
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s6, s11
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__4_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s2, s7
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 4, i32 14>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR variant, mask <5, 14>: the result is { %vec0[5], %vec1[6] }, so both
; inputs are live and two def blocks are expected. Lane 0 is an odd-indexed
; element (shifted right by 16 before the pack); lane 1 is even-indexed.
define void @s_shuffle_v2bf16_v8bf16__5_14() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s11
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s11
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__5_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s2, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s7
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 5, i32 14>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR variant, mask <6, 14>: the result is { %vec0[6], %vec1[6] }, the same
; even-indexed element position taken from each input. Two def blocks are
; expected and the source dwords are packed with no shift.
define void @s_shuffle_v2bf16_v8bf16__6_14() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s11
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s11
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__6_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s3, s7
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 6, i32 14>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR variant, mask <7, 14>: the result is { %vec0[7], %vec1[6] }, so both
; inputs are live and two def blocks are expected. Lane 0 is the last,
; odd-indexed element of %vec0 (shifted right by 16 before the pack).
define void @s_shuffle_v2bf16_v8bf16__7_14() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s11
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s11
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__7_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s3, 16
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s7
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 7, i32 14>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR variant, mask <8, 14>: the result is { %vec1[0], %vec1[6] }; %vec0 does
; not contribute, so a single def block is expected. Both selected elements are
; even-indexed and are packed with no shift.
define void @s_shuffle_v2bf16_v8bf16__8_14() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__8_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 8, i32 14>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR variant, mask <9, 14>: the result is { %vec1[1], %vec1[6] }; %vec0 does
; not contribute, so a single def block is expected. Lane 0 is odd-indexed
; (shifted right by 16 before the pack); lane 1 is even-indexed.
define void @s_shuffle_v2bf16_v8bf16__9_14() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__9_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 14>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR variant, mask <10, 14>: the result is { %vec1[2], %vec1[6] }; %vec0 does
; not contribute, so a single def block is expected. Both selected elements are
; even-indexed and are packed with no shift.
define void @s_shuffle_v2bf16_v8bf16__10_14() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__10_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 14>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR variant, mask <11, 14>: the result is { %vec1[3], %vec1[6] }; %vec0 does
; not contribute, so a single def block is expected. Lane 0 is odd-indexed
; (shifted right by 16 before the pack); lane 1 is even-indexed.
define void @s_shuffle_v2bf16_v8bf16__11_14() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__11_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s1, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 14>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR variant, mask <12, 14>: the result is { %vec1[4], %vec1[6] }; %vec0 does
; not contribute, so a single def block is expected. Both selected elements are
; even-indexed and are packed with no shift.
define void @s_shuffle_v2bf16_v8bf16__12_14() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s6, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s6, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__12_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s2, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 14>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR variant, mask <13, 14>: the result is { %vec1[5], %vec1[6] }, consecutive
; elements that start on an odd index. %vec0 does not contribute. The expected
; output shifts the dword holding lane 0 right by 16, then packs it with the
; following dword.
define void @s_shuffle_v2bf16_v8bf16__13_14() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__13_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s2, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 14>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <14, 14>: both lanes select element 6 of %vec1, and
; %vec0 is not referenced by the mask.
; The expected code contains a single asm def and packs its fourth dword with
; itself into s8 (required by "{s8}"), with no shift.
define void @s_shuffle_v2bf16_v8bf16__14_14() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_14:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_14:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__14_14:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s3, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 14>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <poison, 15>: the first lane is unconstrained and the
; second selects element 7 of %vec1; %vec0 is not referenced by the mask.
; The expected code contains a single asm def and just copies its fourth dword
; into s8 (required by "{s8}") with s_mov_b32; no shift or pack is emitted.
define void @s_shuffle_v2bf16_v8bf16__u_15() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__u_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s8, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__u_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s8, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__u_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s8, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 poison, i32 15>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <0, 15>: lane 0 is element 0 of %vec0 and lane 1 is
; element 7 of %vec1, so both asm defs appear in the expected code.
; The fourth dword of the second def is shifted right by 16, then the first
; dword of the first def is packed with it into s8 (required by "{s8}").
define void @s_shuffle_v2bf16_v8bf16__0_15() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__0_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s11, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__0_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s11, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__0_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s7, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 0, i32 15>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <1, 15>: lane 0 is element 1 of %vec0 and lane 1 is
; element 7 of %vec1, so both asm defs appear in the expected code.
; Two shifts by 16 are expected (fourth dword of the second def, first dword of
; the first def) before the pair is packed into s8 (required by "{s8}").
define void @s_shuffle_v2bf16_v8bf16__1_15() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__1_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s11, 16
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__1_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s11, 16
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__1_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s7, 16
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 1, i32 15>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <2, 15>: lane 0 is element 2 of %vec0 and lane 1 is
; element 7 of %vec1, so both asm defs appear in the expected code.
; The fourth dword of the second def is shifted right by 16, then the second
; dword of the first def is packed with it into s8 (required by "{s8}").
define void @s_shuffle_v2bf16_v8bf16__2_15() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__2_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s11, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__2_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s11, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__2_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s7, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 2, i32 15>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <3, 15>: lane 0 is element 3 of %vec0 and lane 1 is
; element 7 of %vec1, so both asm defs appear in the expected code.
; Two shifts by 16 are expected (fourth dword of the second def, second dword
; of the first def) before the pair is packed into s8 (required by "{s8}").
define void @s_shuffle_v2bf16_v8bf16__3_15() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__3_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s11, 16
; GFX900-NEXT:    s_lshr_b32 s5, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__3_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s11, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__3_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s7, 16
; GFX942-NEXT:    s_lshr_b32 s1, s1, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 3, i32 15>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <4, 15>: lane 0 is element 4 of %vec0 and lane 1 is
; element 7 of %vec1, so both asm defs appear in the expected code.
; The fourth dword of the second def is shifted right by 16, then the third
; dword of the first def is packed with it into s8 (required by "{s8}").
define void @s_shuffle_v2bf16_v8bf16__4_15() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__4_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s11, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s6, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__4_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s11, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s6, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__4_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s7, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s2, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 4, i32 15>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <5, 15>: lane 0 is element 5 of %vec0 and lane 1 is
; element 7 of %vec1, so both asm defs appear in the expected code.
; Two shifts by 16 are expected (fourth dword of the second def, third dword of
; the first def) before the pair is packed into s8 (required by "{s8}").
define void @s_shuffle_v2bf16_v8bf16__5_15() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__5_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s11, 16
; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__5_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s11, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__5_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s7, 16
; GFX942-NEXT:    s_lshr_b32 s1, s2, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 5, i32 15>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <6, 15>: lane 0 is element 6 of %vec0 and lane 1 is
; element 7 of %vec1, so both asm defs appear in the expected code.
; The fourth dword of the second def is shifted right by 16, then the fourth
; dword of the first def is packed with it into s8 (required by "{s8}").
define void @s_shuffle_v2bf16_v8bf16__6_15() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__6_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s11, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__6_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s11, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s7, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__6_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s7, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s3, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 6, i32 15>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <7, 15>: lane 0 is element 7 of %vec0 and lane 1 is
; element 7 of %vec1, so both asm defs appear in the expected code.
; The fourth dword of each def is shifted right by 16 and the two results are
; packed (first-def value first) into s8 (required by "{s8}").
define void @s_shuffle_v2bf16_v8bf16__7_15() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__7_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[8:11]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s11, 16
; GFX900-NEXT:    s_lshr_b32 s5, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__7_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[8:11]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s11, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__7_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[4:7]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s7, 16
; GFX942-NEXT:    s_lshr_b32 s1, s3, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 7, i32 15>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <8, 15>: both indices are above 7, so the lanes come
; from elements 0 and 7 of %vec1 and %vec0 is not referenced by the mask.
; The expected code contains a single asm def, shifts its fourth dword right by
; 16 and packs the first dword with that into s8 (required by "{s8}").
define void @s_shuffle_v2bf16_v8bf16__8_15() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__8_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__8_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__8_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s3, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 8, i32 15>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <9, 15>: both indices are above 7, so the lanes come
; from elements 1 and 7 of %vec1 and %vec0 is not referenced by the mask.
; The expected code contains a single asm def, shifts its fourth and first
; dwords right by 16 and packs them into s8 (required by "{s8}").
define void @s_shuffle_v2bf16_v8bf16__9_15() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__9_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s5, s7, 16
; GFX900-NEXT:    s_lshr_b32 s4, s4, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__9_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s5, s7, 16
; GFX90A-NEXT:    s_lshr_b32 s4, s4, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s4, s5
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__9_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s1, s3, 16
; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s0, s1
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 9, i32 15>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <10, 15>: both indices are above 7, so the lanes come
; from elements 2 and 7 of %vec1 and %vec0 is not referenced by the mask.
; The expected code contains a single asm def, shifts its fourth dword right by
; 16 and packs the second dword with that into s8 (required by "{s8}").
define void @s_shuffle_v2bf16_v8bf16__10_15() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__10_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__10_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__10_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s3, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 10, i32 15>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <11, 15>: both indices are above 7, so the lanes come
; from elements 3 and 7 of %vec1 and %vec0 is not referenced by the mask.
; The expected code contains a single asm def, shifts its fourth and second
; dwords right by 16 and packs them into s8 (required by "{s8}").
define void @s_shuffle_v2bf16_v8bf16__11_15() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__11_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s7, 16
; GFX900-NEXT:    s_lshr_b32 s5, s5, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__11_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s7, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s5, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__11_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s3, 16
; GFX942-NEXT:    s_lshr_b32 s1, s1, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 11, i32 15>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <12, 15>: both indices are above 7, so the lanes come
; from elements 4 and 7 of %vec1 and %vec0 is not referenced by the mask.
; The expected code contains a single asm def, shifts its fourth dword right by
; 16 and packs the third dword with that into s8 (required by "{s8}").
define void @s_shuffle_v2bf16_v8bf16__12_15() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__12_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s7, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s6, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__12_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s7, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s6, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__12_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s3, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s2, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 12, i32 15>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <13, 15>: both indices are above 7, so the lanes come
; from elements 5 and 7 of %vec1 and %vec0 is not referenced by the mask.
; The expected code contains a single asm def, shifts its fourth and third
; dwords right by 16 and packs them into s8 (required by "{s8}").
define void @s_shuffle_v2bf16_v8bf16__13_15() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__13_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_lshr_b32 s4, s7, 16
; GFX900-NEXT:    s_lshr_b32 s5, s6, 16
; GFX900-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__13_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_lshr_b32 s4, s7, 16
; GFX90A-NEXT:    s_lshr_b32 s5, s6, 16
; GFX90A-NEXT:    s_pack_ll_b32_b16 s8, s5, s4
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__13_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_lshr_b32 s0, s3, 16
; GFX942-NEXT:    s_lshr_b32 s1, s2, 16
; GFX942-NEXT:    s_pack_ll_b32_b16 s8, s1, s0
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 13, i32 15>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}

; SGPR shuffle with mask <14, 15>: both indices are above 7, so the lanes come
; from elements 6 and 7 of %vec1 and %vec0 is not referenced by the mask.
; The expected code contains a single asm def and just copies its fourth dword
; into s8 (required by "{s8}") with s_mov_b32; no shift or pack is emitted.
define void @s_shuffle_v2bf16_v8bf16__14_15() {
; GFX900-LABEL: s_shuffle_v2bf16_v8bf16__14_15:
; GFX900:       ; %bb.0:
; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; def s[4:7]
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s8, s7
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ; use s8
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: s_shuffle_v2bf16_v8bf16__14_15:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; def s[4:7]
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_mov_b32 s8, s7
; GFX90A-NEXT:    ;;#ASMSTART
; GFX90A-NEXT:    ; use s8
; GFX90A-NEXT:    ;;#ASMEND
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX942-LABEL: s_shuffle_v2bf16_v8bf16__14_15:
; GFX942:       ; %bb.0:
; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; def s[0:3]
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_mov_b32 s8, s3
; GFX942-NEXT:    ;;#ASMSTART
; GFX942-NEXT:    ; use s8
; GFX942-NEXT:    ;;#ASMEND
; GFX942-NEXT:    s_setpc_b64 s[30:31]
  %vec0 = call <8 x bfloat> asm "; def $0", "=s"()
  %vec1 = call <8 x bfloat> asm "; def $0", "=s"()
  %shuf = shufflevector <8 x bfloat> %vec0, <8 x bfloat> %vec1, <2 x i32> <i32 14, i32 15>
  call void asm sideeffect "; use $0", "{s8}"(<2 x bfloat> %shuf)
  ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX90APLUS: {{.*}}
