ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector > Struct Template Reference

ThreadGroupTensorSliceTransfer_DirectLoad&lt; ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector &gt; Struct Template Reference#

Composable Kernel: ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector > Struct Template Reference
ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector > Struct Template Reference

#include <thread_group_tensor_slice_transfer_direct_load.hpp>

Public Types

using Index = MultiIndex<nDim>
using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}))
using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}))
using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{}))
using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{}))

Public Member Functions

__device__ constexpr ThreadGroupTensorSliceTransfer_DirectLoad (const SrcDesc &src_desc, const Index &src_block_slice_origin, const DstDesc &dst_desc, const Index &dst_block_slice_origin)
__device__ void SetSrcSliceOrigin (const SrcDesc &src_desc, const Index &src_slice_origin_idx)
__device__ void SetDstSliceOrigin (const DstDesc &dst_desc, const Index &dst_slice_origin_idx)
__device__ void ResetDstSliceWindow (const DstDesc &dst_desc)
template<typename SrcBuffer, typename DstBuffer>
__device__ void Run (const SrcDesc &src_desc, const SrcBuffer &src_buf, const DstDesc &dst_desc, DstBuffer &dst_buf)
__device__ void MoveSrcSliceWindow (const SrcDesc &src_desc, const Index &step)
template<typename DescType>
__device__ auto generate_steps (const DescType &desc, int sign)

Static Public Member Functions

static __device__ constexpr bool AreThreadClusterLengthsValid ()

Static Public Attributes

static constexpr index_t nDim = remove_reference_t<SrcDesc>::GetNumOfDimension()
static constexpr auto I0 = Number<0>{}
static constexpr auto I1 = Number<1>{}
static constexpr auto block_slice_lengths = BlockSliceLengths{}
static constexpr auto thread_cluster_lengths = ThreadClusterLengths{}
static constexpr auto thread_single_load_size
static constexpr auto thread_steps = thread_cluster_lengths * thread_single_load_size
static constexpr auto thread_slice_lengths = block_slice_lengths / thread_steps

Detailed Description

template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
struct ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector >

Transfer that uses direct load instructions to copy data from global to LDS memory.

Traditional loads first copy data from global to registers, and then from registers to LDS. Direct loads do not need an intermediate step, data is copied directly from global to LDS, without the use of additional registers.

However, the instruction has limitations:

  • each thread must copy exactly a single DWORD - 4 bytes;
  • threads within a single wavefront must write consecutive DWORDS into LDS, (data in global do not need to be contiguous, each thread might have its own offset).

To make sure that all the transfers finished, the waitcnt instruction must be used with vmcnt instead of lgkmcnt.

Limitations of the transfer class:

  • SrcData must be the same as DstData - no possibility to convert the data type in flight;
  • DstVectorDim must be the last dimension;
  • SrcVectorDim must be the last dimension if ScalarPerVector is greater than 1;
  • ScalarPerVector times the number of bytes of DstData must be equal to a single DWORD = 4B (for examlpe if DstData is fp32, then ScalarPerVector must be 1; if DstData is fp16, ScalarPerVector must be 2);
  • if ScalarPerVector is greater than 1, the contiguous dimension in src and dst must be the same dimension;
  • threads in a wavefront must write contiguous data to LDS (when wavefront size is 64, they must write 64 contiguous DWORDs) - ThreadClusterLengths must be prepared in such a way to guarantee that.

Member Typedef Documentation

◆ DstCoord

template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
using ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector >::DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}))

◆ DstCoordStep

template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
using ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector >::DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{}))

◆ Index

template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
using ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector >::Index = MultiIndex<nDim>

◆ SrcCoord

template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
using ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector >::SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}))

◆ SrcCoordStep

template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
using ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector >::SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{}))

Constructor & Destructor Documentation

◆ ThreadGroupTensorSliceTransfer_DirectLoad()

template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
__device__ constexpr ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector >::ThreadGroupTensorSliceTransfer_DirectLoad ( const SrcDesc & src_desc,
const Index & src_block_slice_origin,
const DstDesc & dst_desc,
const Index & dst_block_slice_origin )
inlineconstexpr

Member Function Documentation

◆ AreThreadClusterLengthsValid()

template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
__device__ constexpr bool ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector >::AreThreadClusterLengthsValid ( )
inlinestaticconstexpr

◆ generate_steps()

template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
template<typename DescType>
__device__ auto ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector >::generate_steps ( const DescType & desc,
int sign )
inline

◆ MoveSrcSliceWindow()

template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
__device__ void ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector >::MoveSrcSliceWindow ( const SrcDesc & src_desc,
const Index & step )
inline

◆ ResetDstSliceWindow()

template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
__device__ void ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector >::ResetDstSliceWindow ( const DstDesc & dst_desc)
inline

◆ Run()

template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
template<typename SrcBuffer, typename DstBuffer>
__device__ void ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector >::Run ( const SrcDesc & src_desc,
const SrcBuffer & src_buf,
const DstDesc & dst_desc,
DstBuffer & dst_buf )
inline

◆ SetDstSliceOrigin()

template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
__device__ void ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector >::SetDstSliceOrigin ( const DstDesc & dst_desc,
const Index & dst_slice_origin_idx )
inline

◆ SetSrcSliceOrigin()

template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
__device__ void ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector >::SetSrcSliceOrigin ( const SrcDesc & src_desc,
const Index & src_slice_origin_idx )
inline

Member Data Documentation

◆ block_slice_lengths

template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
auto ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector >::block_slice_lengths = BlockSliceLengths{}
staticconstexpr

◆ I0

template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
auto ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector >::I0 = Number<0>{}
staticconstexpr

◆ I1

template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
auto ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector >::I1 = Number<1>{}
staticconstexpr

◆ nDim

template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
index_t ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector >::nDim = remove_reference_t<SrcDesc>::GetNumOfDimension()
staticconstexpr

◆ thread_cluster_lengths

template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
auto ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector >::thread_cluster_lengths = ThreadClusterLengths{}
staticconstexpr

◆ thread_single_load_size

template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
auto ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector >::thread_single_load_size
staticconstexpr
Initial value:
integral_constant< index_t, N > Number
Definition number.hpp:12
__host__ __device__ constexpr auto generate_sequence(F, Number< N >)
Definition sequence_helper.hpp:18
Definition threadwise_tensor_slice_transfer_util.hpp:20

◆ thread_slice_lengths

template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
auto ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector >::thread_slice_lengths = block_slice_lengths / thread_steps
staticconstexpr

◆ thread_steps

template<typename ThreadGroup, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t ScalarPerVector>
auto ck::ThreadGroupTensorSliceTransfer_DirectLoad< ThreadGroup, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, SrcVectorDim, DstVectorDim, ScalarPerVector >::thread_steps = thread_cluster_lengths * thread_single_load_size
staticconstexpr

The documentation for this struct was generated from the following file: