config.hpp Source File

config.hpp Source File#

Composable Kernel: config.hpp Source File
config.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: MIT
2// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
3
4#pragma once
5
6#if defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__) || \
7 defined(__gfx9_4_generic__)
8#define __gfx9__
9#endif
10#if defined(__gfx942__) || defined(__gfx950__) || defined(__gfx9_4_generic__)
11#define __gfx94__
12#endif
13#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || \
14 defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || \
15 defined(__gfx10_3_generic__)
16#define __gfx103__
17#endif
18#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || \
19 defined(__gfx1103__) || defined(__gfx1150__) || defined(__gfx1151__) || \
20 defined(__gfx1152__) || defined(__gfx11_generic__)
21#define __gfx11__
22#endif
23#if defined(__gfx1200__) || defined(__gfx1201__) || defined(__gfx12_generic__)
24#define __gfx12__
25#endif
26
27#include "hip/hip_version.h"
28#ifndef CK_TILE_DONT_USE_HIP_RUNTIME_HEADERS
29#include "hip/hip_runtime.h"
30#include "hip/hip_fp16.h"
31#endif
32
33#ifdef __HIPCC__
34#define CK_TILE_HOST inline __host__
35#define CK_TILE_DEVICE inline __device__
36#define CK_TILE_HOST_DEVICE inline __host__ __device__
37#define CK_TILE_DEVICE_EXTERN __device__
38#define CK_TILE_HOST_DEVICE_EXTERN __host__ __device__
39#else
40#define CK_TILE_HOST inline
41#define CK_TILE_DEVICE inline
42#define CK_TILE_HOST_DEVICE inline
43#define CK_TILE_DEVICE_EXTERN
44#define CK_TILE_HOST_DEVICE_EXTERN
45#endif
46
47// implementing the "memory address space" attribute
48// https://llvm.org/docs/AMDGPUUsage.html#amdgpu-address-spaces-table
49// WA for https://github.com/ROCm/composable_kernel/issues/1946
50#if 0
51#define CK_TILE_GENERIC_ADDR __attribute__((address_space(0)))
52#define CK_TILE_GLOBAL_ADDR __attribute__((address_space(1)))
53#define CK_TILE_LDS_ADDR __attribute__((address_space(3)))
54#define CK_TILE_BUF_RES_ADDR __attribute__((address_space(8)))
55#else
56#define CK_TILE_GENERIC_ADDR
57#define CK_TILE_GLOBAL_ADDR
58#define CK_TILE_LDS_ADDR
59#define CK_TILE_BUF_RES_ADDR
60#endif
61#ifndef CK_TILE_USE_CUSTOM_DATA_TYPE
62#define CK_TILE_USE_CUSTOM_DATA_TYPE 0 // custom data type will generate extra move/bfi code
63#endif
64
65#define CK_TILE_FLOAT_TO_BFLOAT16_STANDARD 0
66#define CK_TILE_FLOAT_TO_BFLOAT16_TRUNCATE_WITH_NAN 1
67#define CK_TILE_FLOAT_TO_BFLOAT16_TRUNCATE 2
68#define CK_TILE_FLOAT_TO_BFLOAT16_STANDARD_ASM 3
69#define CK_TILE_FLOAT_TO_BFLOAT16_RTA_ASM 4
70
71#ifndef CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT
72#define CK_TILE_FLOAT_TO_BFLOAT16_DEFAULT CK_TILE_FLOAT_TO_BFLOAT16_TRUNCATE
73#endif
74
75#define CK_TILE_FLOAT_TO_FP8_STANDARD 0
76#define CK_TILE_FLOAT_TO_FP8_STOCHASTIC 1
77
78#ifndef CK_TILE_FLOAT_TO_FP8_DEFAULT
79#define CK_TILE_FLOAT_TO_FP8_DEFAULT CK_TILE_FLOAT_TO_FP8_STANDARD
80#endif
81
82// in the old rocm period, we have to use tuple array implementation to implement this
83// so turn on the _USE_TUPLE if meet compiler error, otherwise _USE_ARRAY by default.
84#define CK_TILE_STATICALLY_INDEXED_ARRAY_USE_ARRAY 0
85#define CK_TILE_STATICALLY_INDEXED_ARRAY_USE_TUPLE 1
86#ifndef CK_TILE_STATICALLY_INDEXED_ARRAY_DEFAULT
87#define CK_TILE_STATICALLY_INDEXED_ARRAY_DEFAULT CK_TILE_STATICALLY_INDEXED_ARRAY_USE_TUPLE
88#endif
89
90#define CK_TILE_THREAD_BUFFER_USE_ARRAY 0
91#define CK_TILE_THREAD_BUFFER_USE_TUPLE 1
92#ifndef CK_TILE_THREAD_BUFFER_DEFAULT
93#define CK_TILE_THREAD_BUFFER_DEFAULT CK_TILE_THREAD_BUFFER_USE_ARRAY
94#endif
95
96#ifndef CK_TILE_TUPLE_CTOR_WITH_INITIALIZER_LIST
97#if CK_TILE_THREAD_BUFFER_DEFAULT == CK_TILE_THREAD_BUFFER_USE_TUPLE
98// if using tuple-array as thread_buffer implementation, need to support {} brace init
99// ... with similiar behavior as array
100#define CK_TILE_TUPLE_CTOR_WITH_INITIALIZER_LIST 1
101#else
102#define CK_TILE_TUPLE_CTOR_WITH_INITIALIZER_LIST 0
103#endif
104#endif
105
106#ifndef CK_TILE_USE_LAUNCH_BOUNDS
107#define CK_TILE_USE_LAUNCH_BOUNDS 1
108#endif
109
110#ifndef CK_TILE_TIME_KERNEL
111#define CK_TILE_TIME_KERNEL 1
112#endif
113
114#define CK_TILE_MAX_THREAD_PER_BLOCK 256
115#define CK_TILE_MIN_BLOCK_PER_CU 2
116
117#ifndef CK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
118#define CK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 0
119#endif
120
121#ifndef CK_TILE_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
122#define CK_TILE_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK 1
123#endif
124
125#ifndef CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK
126#define CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK 1
127#endif
128
129#ifndef CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK
130#define CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK 1
131#endif
132
133#ifndef CK_TILE_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
134#define CK_TILE_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM 1
135#endif
136
137#ifndef CK_TILE_USE_AMD_BUFFER_LOAD
138#define CK_TILE_USE_AMD_BUFFER_LOAD 1
139#endif
140
141#ifndef CK_TILE_USE_AMD_BUFFER_STORE
142#define CK_TILE_USE_AMD_BUFFER_STORE 1
143#endif
144
145#ifndef CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER
146#define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER 1
147#endif
148
149#ifndef CK_TILE_USE_PK4_LAYOUT_SHUFFLE
150#define CK_TILE_USE_PK4_LAYOUT_SHUFFLE 1
151#endif
152
153// buffer atomic add: floating point
154#ifndef __HIP_DEVICE_COMPILE__ // for host code
155#define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
156#elif defined(__gfx9__) || defined(__gfx12__) // for GPU code
157#define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
158#else // for GPU code
159#define CK_TILE_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0
160#endif
161
162#if(defined(__gfx90a__) || defined(__gfx94__)) // for GPU code
163#define CK_TILE_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 1
164#else
165#define CK_TILE_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 0
166#endif
167
168#ifndef CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
169#define CK_TILE_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS 0
170#endif
171
172#ifndef CK_TILE_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
173#define CK_TILE_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE 1
174#endif
175
176#ifndef CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE
177#if HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 1 && HIP_VERSION_PATCH >= 40091
178#define CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE 1
179#else
180#define CK_TILE_WORKAROUND_ROCM_6_1_SCRATCH_MEMORY_ISSUE 0
181#endif
182#endif
183
184// workaround for ROCm 6.2 and later
185#ifndef CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE
186#if(HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 2 && HIP_VERSION_PATCH >= 41133) || \
187 (HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 3 && HIP_VERSION_PATCH >= 42131) || \
188 (HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR > 3)
189#define CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE 1
190#else
191#define CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE 0
192#endif
193#endif
194
195// use llvm builtin bf16 data type after ROCm 6.5
196#ifndef CK_TILE_USE_LLVM_BUILTIN_BF16
197#if(HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 5 && HIP_VERSION_PATCH >= 50421) || \
198 (HIP_VERSION_MAJOR >= 7)
199#define CK_TILE_USE_LLVM_BUILTIN_BF16 1
200#else
201#define CK_TILE_USE_LLVM_BUILTIN_BF16 0
202#endif
203#endif
204
205#ifndef CK_TILE_DEBUG_LOG
206#define CK_TILE_DEBUG_LOG 0
207#endif
208
209#ifndef __HIP_DEVICE_COMPILE__ // for host code
210#define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0xffffffff
211#elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || \
212 defined(__gfx9__) // for GPU code
213#define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0x00020000
214#elif defined(__gfx103__) // for GPU code
215#define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0x31014000
216#elif defined(__gfx11__) || defined(__gfx12__) // for GPU code
217#define CK_TILE_BUFFER_RESOURCE_3RD_DWORD 0x31004000
218#endif
219
220#ifndef CK_TILE_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
221#define CK_TILE_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1
222#endif
223
224#ifndef CK_TILE_USE_SUBDWORD_TILE_CAST
225#define CK_TILE_USE_SUBDWORD_TILE_CAST 0
226#endif
227
228#ifndef CK_TILE_USE_PK_FP16_TILE_CAST
229#define CK_TILE_USE_PK_FP16_TILE_CAST 0
230#endif
231
232// TODO: better solve this inside compiler
233#ifndef CK_TILE_FMHA_FWD_FAST_EXP2
234#define CK_TILE_FMHA_FWD_FAST_EXP2 0
235#endif
236
237#ifndef CK_TILE_FMHA_FLOAT_TO_FLOAT16_RTN
238#define CK_TILE_FMHA_FLOAT_TO_FLOAT16_RTN 0
239#endif
240
241#ifndef CK_TILE_BUFFER_LOAD_RAW_BF16_WA
242#define CK_TILE_BUFFER_LOAD_RAW_BF16_WA 1
243#endif
244
245// workaround: compiler not emiting reciprocal instruction frm __frcp_rn()
246#ifndef CK_TILE_WORKAROUND_SWDEV_383542
247#define CK_TILE_WORKAROUND_SWDEV_383542 1
248#endif
249
250#ifndef CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
251#define CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID 1
252#endif
253
254#ifndef CK_TILE_USE_OCP_FP8
255#if defined(__HIP_DEVICE_COMPILE__)
256#if defined(__gfx950__) || defined(__gfx12__)
257#define CK_TILE_USE_OCP_FP8 1
258#else
259#define CK_TILE_USE_OCP_FP8 0
260#endif
261#else
262#define CK_TILE_USE_OCP_FP8 0
263#endif
264#endif
265
266#ifndef CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN
267#if __clang_major__ >= 20 && !(defined(__gfx103__) || defined(__gfx11__) || defined(__gfx12__))
268#define CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN 1
269#else
270#define CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN 0
271#endif
272#endif
273
274#ifndef CK_TILE_WA_ISSUE_2028
275#define CK_TILE_WA_ISSUE_2028 0
276#endif
277
278// Y pointed to R, we don't see a valuable use case.
279// Will enforce encoding to check Y not pointed to R if set to zero
280#ifndef CK_TILE_ENC_SUPPORT_Y_TO_R
281#define CK_TILE_ENC_SUPPORT_Y_TO_R 0
282#endif