From f9446b8f4c3cb3a3c6d38734f9980712a82b9db9 Mon Sep 17 00:00:00 2001 From: Cory Bloor Date: Fri, 8 Jul 2022 20:53:32 -0600 Subject: [PATCH] Improve guards for arch-specific instructions (#368) When choosing between a specialized implementation that uses architecture-specific functionality and a generic fallback, it is usually preferable to make the fallback the default. This will give the software the best possible chance of functioning without modification on future hardware. Of course, the library will still need code updates to function optimally on hardware released after the software was written. rocSPARSE can also be compiled with CXXFLAGS=-DROCSPARSE_USE_MOVE_DPP=0 to force the use of the fallback implementation. Or with the value 1 to force the use of the specialized __hip_move_dpp implementation. This change fixes the compilation error: Illegal instruction detected: Invalid dpp_ctrl value: broadcasts are not supported on GFX10+ when building for unsupported Navi 1x and Navi 2x GPUs as was reported in https://github.com/ROCmSoftwarePlatform/rocSPARSE/issues/250 --- library/src/include/common.h | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/library/src/include/common.h b/library/src/include/common.h index 6a4654af..975c5f7d 100644 --- a/library/src/include/common.h +++ b/library/src/include/common.h @@ -34,6 +34,18 @@ // clang-format off +#ifndef ROCSPARSE_USE_MOVE_DPP +#if defined(__gfx803__) || \ + defined(__gfx900__) || \ + defined(__gfx906__) || \ + defined(__gfx908__) || \ + defined(__gfx90a__) +#define ROCSPARSE_USE_MOVE_DPP 1 +#else +#define ROCSPARSE_USE_MOVE_DPP 0 +#endif +#endif + // BSR indexing macros #define BSR_IND(j, bi, bj, dir) ((dir == rocsparse_direction_row) ? BSR_IND_R(j, bi, bj) : BSR_IND_C(j, bi, bj)) #define BSR_IND_R(j, bi, bj) (block_dim * block_dim * (j) + (bi) * block_dim + (bj)) @@ -233,7 +245,7 @@ __device__ __forceinline__ void rocsparse_blockreduce_min(int i, T* data) if(BLOCKSIZE > 1) { if(i < 1 && i + 1 < BLOCKSIZE) { data[i] = min(data[i], data[i + 1]); } __syncthreads(); } } -#ifndef __gfx1030__ +#if ROCSPARSE_USE_MOVE_DPP // DPP-based wavefront reduction maximum template __device__ __forceinline__ void rocsparse_wfreduce_max(int* maximum) @@ -499,7 +511,7 @@ __device__ __forceinline__ double rocsparse_wfreduce_sum(double sum) sum = temp_sum.val; return sum; } -#else +#else /* ROCSPARSE_USE_MOVE_DPP */ template __device__ __forceinline__ void rocsparse_wfreduce_max(int* maximum) { @@ -566,7 +578,7 @@ __device__ __forceinline__ double rocsparse_wfreduce_sum(double sum) return sum; } -#endif +#endif /* ROCSPARSE_USE_MOVE_DPP */ // DPP-based complex float wavefront reduction sum template