# --- T2-COPYRIGHT-BEGIN ---
# t2/package/*/llama-cpp/opencl-amd.patch
# Copyright (C) 2025 The T2 SDE Project
# SPDX-License-Identifier: GPL-2.0 or patched project license
# --- T2-COPYRIGHT-END ---

Teach the llama.cpp OpenCL backend to recognize AMD GPUs, reusing the
existing Intel kernel selections and work-group parameters for them.

--- llama.cpp.git/ggml/src/ggml-opencl/ggml-opencl.cpp.vanilla	2025-04-03 16:10:28.453521710 +0200
+++ llama.cpp.git/ggml/src/ggml-opencl/ggml-opencl.cpp	2025-04-03 16:19:30.697993755 +0200
@@ -54,6 +54,7 @@
 enum GPU_FAMILY {
     ADRENO,
     INTEL,
+    AMD,
     UNKNOWN,
 };
 
@@ -564,6 +565,8 @@
         backend_ctx->adreno_wave_size = 64;
     } else if (strstr(default_device->name, "Intel")) {
         backend_ctx->gpu_family = GPU_FAMILY::INTEL;
+    } else if (strstr(default_device->name, "AMD")) {
+        backend_ctx->gpu_family = GPU_FAMILY::AMD;
     } else {
         GGML_LOG_ERROR("Unsupported GPU: %s\n", default_device->name);
         backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
@@ -3332,6 +3335,11 @@
         nth1 = 1;
 
         kernel = backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat;
+    } else if (backend_ctx->gpu_family == AMD) {
+        nth0 = 16;
+        nth1 = 1;
+
+        kernel = backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat;
     } else if (backend_ctx->gpu_family == ADRENO) {
         nth0 = 64;
         nth1 = 1;
@@ -3371,6 +3379,11 @@
         global_work_size[0] = (size_t)(ne01 + 15)/16*nth0;
         global_work_size[1] = (size_t)ne11*nth1;
         global_work_size[2] = (size_t)ne12*ne13;
+    } else if (backend_ctx->gpu_family == AMD) {
+        // Set global size for AMD. It uses 16x output values.
+        global_work_size[0] = (size_t)(ne01 + 15)/16*nth0;
+        global_work_size[1] = (size_t)ne11*nth1;
+        global_work_size[2] = (size_t)ne12*ne13;
     }
 
 #ifdef GGML_OPENCL_PROFILING
@@ -3400,6 +3413,9 @@
     if (backend_ctx->gpu_family == INTEL) {
         nth0 = 32;
         nth1 = 1;
+    } else if (backend_ctx->gpu_family == AMD) {
+        nth0 = 32;
+        nth1 = 1;
     } else if (backend_ctx->gpu_family == ADRENO) {
         nth0 = 64;
         nth1 = 1;
@@ -3437,6 +3453,9 @@
     if (backend_ctx->gpu_family == INTEL) {
         nth0 = 32;
         nth1 = 1;
+    } else if (backend_ctx->gpu_family == AMD) {
+        nth0 = 32;
+        nth1 = 1;
     } else if (backend_ctx->gpu_family == ADRENO) {
         nth0 = 64;
         nth1 = 1;
@@ -3496,6 +3515,12 @@
 
         kernel = backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat;
         ndst = 8;
+    } else if (backend_ctx->gpu_family == AMD) {
+        nth0 = 16;
+        nth1 = 1;
+
+        kernel = backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat;
+        ndst = 8;
     } else if (backend_ctx->gpu_family == ADRENO) {
         nth0 = 64;
         nth1 = 1;
@@ -3532,6 +3557,16 @@
 
         kernel = backend_ctx->kernel_mul_mat_q4_0_f32;
         ndst = 4;
+    } else if (backend_ctx->gpu_family == AMD) {
+        // Use 1D local size. Each workgroup is a SIMD group. Each SIMD
+        // group produces N_DST (4 for Q4_0 kernel) values in the result.
+        // The number of workgroups on dim 0 (the leading dimension) is
+        // the nearest multiple of 4 that covers ne0 (equals ne01).
+        nth0 = 16;
+        nth1 = 1;
+
+        kernel = backend_ctx->kernel_mul_mat_q4_0_f32;
+        ndst = 4;
     } else if (backend_ctx->gpu_family == ADRENO) {
         nth0 = 64;
         nth1 = 1;
@@ -3571,6 +3606,9 @@
     if (backend_ctx->gpu_family == INTEL) {
         nth0 = 2;
         nth1 = 16;
+    } else if (backend_ctx->gpu_family == AMD) {
+        nth0 = 2;
+        nth1 = 16;
     } else if (backend_ctx->gpu_family == ADRENO) {
         nth0 = 2;
         nth1 = 64;
@@ -3985,6 +4023,10 @@
         // This is the same as the initial value.
         nth = MIN(32, ne00);
     }
+    else if (backend_ctx->gpu_family == AMD) {
+        // This is the same as the initial value.
+        nth = MIN(32, ne00);
+    }
     else if (backend_ctx->gpu_family == ADRENO) {
         nth = 64;
     } else {
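# Below: a minimal standalone C++ sketch (commented out, not part of the
# applied diff) of the work-size math the AMD branches above reuse from the
# Intel path. The tensor extents are made-up example values, not taken from
# a real mul_mat call.
#
#   #include <cstddef>
#   #include <cstdio>
#
#   int main() {
#       // Hypothetical Q4_0 mul_mat extents (example values only).
#       size_t ne01 = 4100, ne11 = 1, ne12 = 1, ne13 = 1;
#       size_t nth0 = 16, nth1 = 1; // AMD path: 16-wide 1D workgroups
#
#       // Dim 0 is rounded up so whole 16-output groups cover ne01:
#       // (4100 + 15) / 16 * 16 = 4112.
#       size_t global_work_size[3] = {
#           (ne01 + 15) / 16 * nth0,
#           ne11 * nth1,
#           ne12 * ne13,
#       };
#       std::printf("global = {%zu, %zu, %zu}\n",
#                   global_work_size[0], global_work_size[1],
#                   global_work_size[2]);
#       return 0;
#   }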