#include <iostream>
#include <cstdio>
#include <cuda_runtime_api.h>
#include "thrust/host_vector.h"
#include "thrust/device_vector.h"
#include <cublasdx.hpp>
#include "common.hpp"
#include "reference.hpp"
#include "arch_runner.hpp"

// NOTE: the template arguments of several declarations (includes, typedefs,
// template parameter lists) were stripped from the original source and are
// reconstructed below. __half precision is an assumption: it is the widest
// precision for which the three 80x80 complex tiles (76,800 bytes of shared
// memory) still fit within the sm_86 opt-in shared-memory limit.
typedef commondx::detail::complex<__half> complex;

const unsigned int size   = 80;
const unsigned int M_TILE = 32;
const unsigned int N_TILE = 32;
const unsigned int K_TILE = 32; // unused as written: the kernel does not loop over K

// Since gemm_kernel_shared is a small (per-tile) kernel, results are checked
// on the host with this helper. The body was truncated in the original; this
// is a minimal reconstruction.
void print_device_vector(thrust::device_vector<complex>& vec) {
    std::cout << "num " << vec.size() << std::endl;
    thrust::host_vector<complex> host = vec;
    for (const auto& v : host) {
        std::cout << "(" << static_cast<float>(v.real()) << ", "
                  << static_cast<float>(v.imag()) << ") ";
    }
    std::cout << std::endl;
}

// This is the small (per-tile) kernel: one block performs a GEMM entirely
// out of shared memory.
template<class GEMM>
__global__ void gemm_kernel_shared(const typename GEMM::c_value_type  alpha,
                                   const typename GEMM::a_value_type* a,
                                   const typename GEMM::b_value_type* b,
                                   const typename GEMM::c_value_type  beta,
                                   typename GEMM::c_value_type*       c,
                                   int lda, int ldb, int ldc) {
    extern __shared__ __align__(16) char smem[];

    // Determine which tile of the matrix this block is responsible for.
    // The stride here is the full matrix size, so together with the bounds
    // check below only block (0, 0) does any work: the GEMM operator already
    // covers the whole 80x80x80 problem, and the remaining blocks of the
    // grid exit immediately.
    const int n = static_cast<int>(size);
    int block_row = blockIdx.y;
    int block_col = blockIdx.x;
    int row_start = block_row * n;
    int col_start = block_col * n;

    // Bounds check: return if the tile falls outside the matrix
    // (prevents out-of-bounds accesses).
    if (row_start >= n || col_start >= n) return;

    // Starting pointers of the A, B, and C tiles (column-major storage).
    auto a_tile = a + row_start;                    // A(row_start, 0)
    auto b_tile = b + col_start * ldb;              // B(0, col_start)
    auto c_tile = c + row_start + col_start * ldc;  // C(row_start, col_start)

    // Make global memory tensors.
    auto a_global_tensor = cublasdx::make_tensor(a_tile, GEMM::get_layout_gmem_a(lda));
    auto b_global_tensor = cublasdx::make_tensor(b_tile, GEMM::get_layout_gmem_b(ldb));
    auto c_global_tensor = cublasdx::make_tensor(c_tile, GEMM::get_layout_gmem_c(ldc));

    // Make shared memory tensors.
    auto [smem_a, smem_b, smem_c] = cublasdx::slice_shared_memory<GEMM>(smem);
    auto a_shared_tensor = cublasdx::make_tensor(smem_a, GEMM::get_layout_smem_a());
    auto b_shared_tensor = cublasdx::make_tensor(smem_b, GEMM::get_layout_smem_b());
    auto c_shared_tensor = cublasdx::make_tensor(smem_c, GEMM::get_layout_smem_c());

    // Load data from global memory tensors to shared memory tensors.
    using alignment = cublasdx::alignment_of<GEMM>;
    cublasdx::copy<GEMM, alignment::a>(a_global_tensor, a_shared_tensor);
    cublasdx::copy<GEMM, alignment::b>(b_global_tensor, b_shared_tensor);
    cublasdx::copy<GEMM, alignment::c>(c_global_tensor, c_shared_tensor);
    cublasdx::copy_wait();

    // Execute GEMM: C = alpha * A * B + beta * C.
    GEMM().execute(alpha, a_shared_tensor, b_shared_tensor, beta, c_shared_tensor);
    __syncthreads();

    // Store data from the shared memory tensor back to global memory.
    cublasdx::copy<GEMM, alignment::c>(c_shared_tensor, c_global_tensor);
}

template<unsigned int Arch>
void tiled_gemm_example(complex alpha, const complex* A, const complex* B,
                        complex beta, complex* C, cudaStream_t stream) {
    // Arch is only used for dispatch; the SM<860> target below is hardcoded,
    // as in the original. The Precision/Type/Arrangement/Function arguments
    // were stripped from the source and are reconstructed here.
    using GEMM = decltype(cublasdx::Size<size, size, size>() +
                          cublasdx::Precision<__half>() +
                          cublasdx::Type<cublasdx::type::complex>() +
                          cublasdx::Arrangement<cublasdx::col_major, cublasdx::col_major>() +
                          cublasdx::Function<cublasdx::function::MM>() +
                          cublasdx::SM<860>() +
                          cublasdx::Block() +
                          cublasdx::BlockDim<512>());

    // 3x3 grid per the tile constants; see the note in the kernel about why
    // only block (0, 0) ends up computing.
    dim3 grid_dim((size + N_TILE - 1) / N_TILE, (size + M_TILE - 1) / M_TILE);
    dim3 block_dim(GEMM::block_dim);

    // The kernel needs more dynamic shared memory than the 48 KB default,
    // so opt in to the larger carve-out before launching.
    const auto shared_size = cublasdx::get_shared_storage_size<GEMM>();
    cudaFuncSetAttribute(gemm_kernel_shared<GEMM>,
                         cudaFuncAttributeMaxDynamicSharedMemorySize,
                         shared_size);

    gemm_kernel_shared<GEMM><<<grid_dim, block_dim, shared_size, stream>>>(
        alpha, A, B, beta, C, size, size, size);
    // cudaPeekAtLastError();
    cudaStreamSynchronize(stream);
}

void dss(int i, cudaStream_t stream, complex alpha, complex* A, complex* B,
         complex beta, complex* C) {
    // tiled_gemm_example<86><<<1, 1, 0, reinterpret_cast<cudaStream_t>(stream)>>>(alpha, A, B, beta, C, stream);
    tiled_gemm_example<86>(alpha, A, B, beta, C, stream);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("CUDA Kernel launch error: %s\n", cudaGetErrorString(err));
    }
}

int main(int, char**) {
    complex a = complex{1.0, 1.0};
    complex b = {2.0, 2.0};
    complex c = {0.0, 0.0};

    const int N_Heri = 1;
    thrust::device_vector<complex> A(size * size * N_Heri, a);
    thrust::device_vector<complex> B(size * size * N_Heri, b);
    thrust::device_vector<complex> C(size * size * N_Heri, c);

    complex alpha{1.0, 1.0};
    complex beta{1.0, 1.0};

    cudaStream_t streams[N_Heri];
    // The remainder of main() was truncated in the original source; the loops
    // below are a minimal reconstruction: one stream per batch entry, each
    // running an independent 80x80 GEMM on its slice of A, B, and C.
    for (int i = 0; i < N_Heri; ++i) {
        cudaStreamCreate(&streams[i]);
        dss(i, streams[i], alpha,
            thrust::raw_pointer_cast(A.data()) + i * size * size,
            thrust::raw_pointer_cast(B.data()) + i * size * size,
            beta,
            thrust::raw_pointer_cast(C.data()) + i * size * size);
    }
    for (int i = 0; i < N_Heri; ++i) {
        cudaStreamSynchronize(streams[i]);
        cudaStreamDestroy(streams[i]);
    }

    print_device_vector(C);
    return 0;
}
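
// ----------------------------------------------------------------------------
// Build note (an assumption, not part of the original source): cuBLASDx is a
// header-only library shipped in the NVIDIA mathDx package, and examples like
// this one are typically built along these lines (paths are placeholders):
//
//   nvcc -std=c++17 -arch=sm_86 --expt-relaxed-constexpr \
//        -I/path/to/mathdx/include tiled_gemm.cu -o tiled_gemm
//
// "common.hpp", "reference.hpp", and "arch_runner.hpp" are helper headers from
// the cuBLASDx examples directory and must also be on the include path.
// ----------------------------------------------------------------------------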