From 1ba4ce4ad792f9672eecc37bf982386d3a007914 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Tue, 6 Jun 2023 18:41:08 +0300 Subject: [PATCH] Revert "warp size fixes" It seems like 32 is faster, for me at least, and it won't cause so many conflicts. This reverts commit 5d6eb72164e5ae000d07dd725e635faa7a2f723d. --- ggml-cuda.cu | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 3a5e1527fb5f7..8b2fc690e03e4 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -182,11 +182,7 @@ typedef struct { } block_q6_k; static_assert(sizeof(block_q6_k) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_k block size/padding"); -#if defined(GGML_USE_HIPBLAS) -#define WARP_SIZE warpSize -#else #define WARP_SIZE 32 -#endif #define CUDA_MUL_BLOCK_SIZE 256 @@ -683,8 +679,8 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, // sum up partial sums and write back result __syncthreads(); #pragma unroll - for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1) { - tmp += __shfl_xor_sync(0xffffffff, tmp, mask, WARP_SIZE); + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); } if (tid == 0) {