diff --git a/.devops/tools.sh b/.devops/tools.sh index efdd6663c9664..2787c21fe6928 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -10,13 +10,13 @@ shift # Join the remaining arguments into a single string arg2="$@" -if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then - python3 ./convert.py $arg2 -elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then - ./quantize $arg2 -elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then - ./main $arg2 -elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then +if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then + python3 ./convert.py "$arg2" +elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then + ./quantize "$arg2" +elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then + ./main "$arg2" +elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then echo "Converting PTH to GGML..." for i in `ls $1/$2/ggml-model-f16.bin*`; do if [ -f "${i/f16/q4_0}" ]; then @@ -26,8 +26,8 @@ elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then ./quantize "$i" "${i/f16/q4_0}" q4_0 fi done -elif [[ $arg1 == '--server' || $arg1 == '-s' ]]; then - ./server $arg2 +elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then + ./server "$arg2" else echo "Unknown command: $arg1" echo "Available commands: " diff --git a/Makefile b/Makefile index 039a75365d18e..4adaaaad87922 100644 --- a/Makefile +++ b/Makefile @@ -151,14 +151,11 @@ ifdef LLAMA_MPI CFLAGS += -DGGML_USE_MPI -Wno-cast-qual CXXFLAGS += -DGGML_USE_MPI -Wno-cast-qual OBJS += ggml-mpi.o - -ggml-mpi.o: ggml-mpi.c ggml-mpi.h - $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_MPI ifdef LLAMA_OPENBLAS - CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas - LDFLAGS += -lopenblas + CFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags openblas) + LDFLAGS += $(shell pkg-config --libs openblas) endif # LLAMA_OPENBLAS ifdef LLAMA_BLIS @@ -247,9 +244,6 @@ ifdef LLAMA_METAL CXXFLAGS += -DGGML_USE_METAL LDFLAGS += -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders OBJS += ggml-metal.o - -ggml-metal.o: ggml-metal.m ggml-metal.h - $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_METAL ifneq ($(filter aarch64%,$(UNAME_M)),) @@ -274,6 +268,16 @@ ifneq ($(filter armv8%,$(UNAME_M)),) CFLAGS += -mfp16-format=ieee -mno-unaligned-access endif +ifdef LLAMA_METAL +ggml-metal.o: ggml-metal.m ggml-metal.h + $(CC) $(CFLAGS) -c $< -o $@ +endif # LLAMA_METAL + +ifdef LLAMA_MPI +ggml-mpi.o: ggml-mpi.c ggml-mpi.h + $(CC) $(CFLAGS) -c $< -o $@ +endif # LLAMA_MPI + ifdef LLAMA_NO_K_QUANTS k_quants.o: k_quants.c k_quants.h $(CC) $(CFLAGS) -c $< -o $@ diff --git a/README.md b/README.md index 476cc438baa2e..f45e4bf0849f0 100644 --- a/README.md +++ b/README.md @@ -640,7 +640,7 @@ Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files t ```bash # run the verification script -python3 .\scripts\verify-checksum-models.py +./scripts/verify-checksum-models.py ``` - On linux or macOS it is also possible to run the following commands to verify if you have all possible latest files in your self-installed `./models` subdirectory: diff --git a/build.zig b/build.zig index 49c159ebf1e10..2287d2a2c442b 100644 --- a/build.zig +++ b/build.zig @@ -1,9 +1,19 @@ const std = @import("std"); +const commit_hash = @embedFile(".git/refs/heads/master"); -// Zig Version: 0.11.0-dev.3379+629f0d23b +// Zig Version: 0.11.0-dev.3986+e05c242cd pub fn build(b: *std.build.Builder) void { const target = b.standardTargetOptions(.{}); const optimize = b.standardOptimizeOption(.{}); + + const config_header = b.addConfigHeader( + .{ .style = .blank, .include_path = "build-info.h" }, + .{ + .BUILD_NUMBER = 0, + .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline + }, + ); + const lib = b.addStaticLibrary(.{ .name = "llama", .target = target, @@ -13,24 +23,21 @@ pub fn build(b: *std.build.Builder) void { lib.linkLibCpp(); lib.addIncludePath("."); lib.addIncludePath("./examples"); - lib.addCSourceFiles(&.{ - "ggml.c", - }, &.{"-std=c11"}); - lib.addCSourceFiles(&.{ - "llama.cpp", - }, &.{"-std=c++11"}); + lib.addConfigHeader(config_header); + lib.addCSourceFiles(&.{"ggml.c"}, &.{"-std=c11"}); + lib.addCSourceFiles(&.{"llama.cpp"}, &.{"-std=c++11"}); b.installArtifact(lib); const examples = .{ "main", "baby-llama", "embedding", - // "metal", + "metal", "perplexity", "quantize", "quantize-stats", "save-load-state", - // "server", + "server", "simple", "train-text-from-scratch", }; @@ -43,16 +50,19 @@ pub fn build(b: *std.build.Builder) void { }); exe.addIncludePath("."); exe.addIncludePath("./examples"); + exe.addConfigHeader(config_header); exe.addCSourceFiles(&.{ - std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{example_name, example_name}), + std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{ example_name, example_name }), "examples/common.cpp", }, &.{"-std=c++11"}); exe.linkLibrary(lib); b.installArtifact(exe); + const run_cmd = b.addRunArtifact(exe); run_cmd.step.dependOn(b.getInstallStep()); if (b.args) |args| run_cmd.addArgs(args); - const run_step = b.step("run_" ++ example_name, "Run the app"); + + const run_step = b.step("run-" ++ example_name, "Run the app"); run_step.dependOn(&run_cmd.step); } } diff --git a/examples/common.cpp b/examples/common.cpp index fd551c9cb2fcf..8705127cb11dc 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -168,6 +168,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.n_ctx = std::stoi(argv[i]); + } else if (arg == "--rope-freq-base") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_freq_base = std::stof(argv[i]); + } else if (arg == "--rope-freq-scale") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_freq_scale = std::stof(argv[i]); } else if (arg == "--memory-f32") { params.memory_f16 = false; } else if (arg == "--top-p") { @@ -285,6 +297,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.lora_adapter = argv[i]; + params.use_mmap = false; } else if (arg == "--lora-base") { if (++i >= argc) { invalid_param = true; @@ -492,6 +505,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); fprintf(stderr, " --cfg-smooth-factor N smooth factor between old and new logits (default: %f, 1.0 = no smoothing)\n", params.cfg_smooth_factor); fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + fprintf(stderr, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); + fprintf(stderr, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); fprintf(stderr, " --no-penalize-nl do not penalize newline token\n"); fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); @@ -520,7 +535,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " --mtest compute maximum memory usage\n"); fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n"); fprintf(stderr, " --verbose-prompt print prompt before generation\n"); - fprintf(stderr, " --lora FNAME apply LoRA adapter\n"); + fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); fprintf(stderr, " -m FNAME, --model FNAME\n"); fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); @@ -572,6 +587,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param lparams.use_mlock = params.use_mlock; lparams.logits_all = params.perplexity; lparams.embedding = params.embedding; + lparams.rope_freq_base = params.rope_freq_base; + lparams.rope_freq_scale = params.rope_freq_scale; return lparams; } diff --git a/examples/common.h b/examples/common.h index 6315df9613445..f52fef6292bf2 100644 --- a/examples/common.h +++ b/examples/common.h @@ -32,6 +32,8 @@ struct gpt_params { int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. + float rope_freq_base = 10000.0f; // RoPE base frequency + float rope_freq_scale = 1.0f; // RoPE frequency scaling factor // sampling parameters std::unordered_map logit_bias; // logit bias for specific tokens diff --git a/examples/embd-input/README.md b/examples/embd-input/README.md index 02d028f261f17..5c4c75ea77cf9 100644 --- a/examples/embd-input/README.md +++ b/examples/embd-input/README.md @@ -17,7 +17,7 @@ make import torch bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin" -pth_path = "./examples/embd_input/llava_projection.pth" +pth_path = "./examples/embd-input/llava_projection.pth" dic = torch.load(bin_path) used_key = ["model.mm_projector.weight","model.mm_projector.bias"] diff --git a/examples/embd-input/llava.py b/examples/embd-input/llava.py index 2f20cb7225b20..bcbdd2bedfd1a 100644 --- a/examples/embd-input/llava.py +++ b/examples/embd-input/llava.py @@ -59,7 +59,7 @@ def chat_with_image(self, image, question): # Also here can use pytorch_model-00003-of-00003.bin directly. a.load_projection(os.path.join( os.path.dirname(__file__) , - "llava_projetion.pth")) + "llava_projection.pth")) respose = a.chat_with_image( Image.open("./media/llama1-logo.png").convert('RGB'), "what is the text in the picture?") diff --git a/examples/main/README.md b/examples/main/README.md index 04b8d5404d5cc..37538613042b0 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -293,5 +293,5 @@ These options provide extra functionality and customization when running the LLa - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. - `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS. -- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model. This allows you to adapt the pretrained model to specific tasks or domains. +- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 2248c245875b0..bcbcf12b0ac19 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -84,9 +84,17 @@ int main(int argc, char ** argv) { return 0; } + if (params.rope_freq_base != 10000.0) { + fprintf(stderr, "%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base); + } + + if (params.rope_freq_scale != 1.0) { + fprintf(stderr, "%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale); + } + if (params.n_ctx > 2048) { - fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);" - "expect poor results\n", __func__, params.n_ctx); + fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified);" + " you are on your own\n", __func__, params.n_ctx); } else if (params.n_ctx < 8) { fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__); params.n_ctx = 8; diff --git a/examples/server/README.md b/examples/server/README.md index 3691abd7457de..e5ca8269b9d56 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -16,7 +16,7 @@ Command line options: - `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended. - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. -- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model. This allows you to adapt the pretrained model to specific tasks or domains. +- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. - `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`. - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`. @@ -66,6 +66,7 @@ Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the ```sh curl --request POST \ --url http://localhost:8080/completion \ + --header "Content-Type: application/json" \ --data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}' ``` diff --git a/examples/server/chat.sh b/examples/server/chat.sh index a89f8e908c403..0143601214b15 100644 --- a/examples/server/chat.sh +++ b/examples/server/chat.sh @@ -32,6 +32,7 @@ tokenize() { --silent \ --request POST \ --url "${API_URL}/tokenize" \ + --header "Content-Type: application/json" \ --data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \ | jq '.tokens[]' } @@ -64,6 +65,7 @@ chat_completion() { --no-buffer \ --request POST \ --url "${API_URL}/completion" \ + --header "Content-Type: application/json" \ --data-raw "${DATA}") printf "\n" diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 4114343ff728a..f442f2b56d368 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -608,6 +608,8 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, fprintf(stderr, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + fprintf(stderr, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); + fprintf(stderr, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n"); @@ -632,7 +634,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); fprintf(stderr, " -a ALIAS, --alias ALIAS\n"); fprintf(stderr, " set an alias for the model, will be added as `model` field in completion response\n"); - fprintf(stderr, " --lora FNAME apply LoRA adapter\n"); + fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port); @@ -722,6 +724,22 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.n_ctx = std::stoi(argv[i]); } + else if (arg == "--rope-freq-base") + { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_freq_base = std::stof(argv[i]); + } + else if (arg == "--rope-freq-scale") + { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_freq_scale = std::stof(argv[i]); + } else if (arg == "--memory-f32" || arg == "--memory_f32") { params.memory_f16 = false; @@ -820,6 +838,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, break; } params.lora_adapter = argv[i]; + params.use_mmap = false; } else if (arg == "--lora-base") { diff --git a/flake.nix b/flake.nix index cebb47b94c92e..dda7ee7b76ade 100644 --- a/flake.nix +++ b/flake.nix @@ -43,6 +43,8 @@ "-DLLAMA_METAL=ON" ]); installPhase = '' + runHook preInstall + mkdir -p $out/bin mv bin/* $out/bin/ mv $out/bin/main $out/bin/llama @@ -51,6 +53,8 @@ echo "#!${llama-python}/bin/python" > $out/bin/convert.py cat ${./convert.py} >> $out/bin/convert.py chmod +x $out/bin/convert.py + + runHook postInstall ''; meta.mainProgram = "llama"; }; diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 436ba870f8870..f6426d4bad168 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -69,6 +69,8 @@ #include "ggml-cuda.h" #include "ggml.h" +#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products + #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data #endif @@ -130,7 +132,7 @@ typedef void (*ggml_cuda_op_t)( #define QK4_0 32 #define QR4_0 2 -#define QI4_0 4 +#define QI4_0 (QK4_0 / (4 * QR4_0)) typedef struct { half d; // delta uint8_t qs[QK4_0 / 2]; // nibbles / quants @@ -139,7 +141,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 #define QK4_1 32 #define QR4_1 2 -#define QI4_1 4 +#define QI4_1 (QK4_1 / (4 * QR4_1)) typedef struct { half d; // delta half m; // min @@ -149,7 +151,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong #define QK5_0 32 #define QR5_0 2 -#define QI5_0 4 +#define QI5_0 (QK5_0 / (4 * QR5_0)) typedef struct { half d; // delta uint8_t qh[4]; // 5-th bit of quants @@ -159,7 +161,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5 #define QK5_1 32 #define QR5_1 2 -#define QI5_1 4 +#define QI5_1 (QK5_1 / (4 * QR5_1)) typedef struct { half d; // delta half m; // min @@ -170,7 +172,7 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + #define QK8_0 32 #define QR8_0 1 -#define QI8_0 8 +#define QI8_0 (QK8_0 / (4 * QR8_0)) typedef struct { half d; // delta int8_t qs[QK8_0]; // quants @@ -179,7 +181,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo #define QK8_1 32 #define QR8_1 1 -#define QI8_1 8 +#define QI8_1 (QK8_1 / (4 * QR8_1)) typedef struct { half d; // delta half s; // unquantized sum @@ -199,6 +201,8 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_ #define K_SCALE_SIZE 12 #endif +#define QR2_K 4 +#define QI2_K (QK_K / (4*QR2_K)) typedef struct { uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits uint8_t qs[QK_K/4]; // quants @@ -207,6 +211,8 @@ typedef struct { } block_q2_K; static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); +#define QR3_K 4 +#define QI3_K (QK_K / (4*QR3_K)) typedef struct { uint8_t hmask[QK_K/8]; // quants - high bit uint8_t qs[QK_K/4]; // quants - low 2 bits @@ -219,6 +225,8 @@ typedef struct { } block_q3_K; //static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding"); +#define QR4_K 2 +#define QI4_K (QK_K / (4*QR4_K)) #ifdef GGML_QKK_64 typedef struct { half d[2]; // super-block scales/mins @@ -236,6 +244,8 @@ typedef struct { static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding"); #endif +#define QR5_K 2 +#define QI5_K (QK_K / (4*QR5_K)) #ifdef GGML_QKK_64 typedef struct { half d; // super-block scale @@ -255,6 +265,8 @@ typedef struct { static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); #endif +#define QR6_K 2 +#define QI6_K (QK_K / (4*QR6_K)) typedef struct { uint8_t ql[QK_K/2]; // quants, lower 4 bits uint8_t qh[QK_K/4]; // quants, upper 2 bits @@ -296,13 +308,13 @@ struct ggml_tensor_extra_gpu { cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs }; -static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) { +static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) { const int i = blockDim.x*blockIdx.x + threadIdx.x; - if (i >= k) { + if (i >= kx) { return; } - dst[i] = x[i] + y[i]; + dst[i] = x[i] + y[i%ky]; } static __global__ void add_f16_f32_f16(const half * x, const float * y, half * dst, const int k) { @@ -323,10 +335,9 @@ static __global__ void mul_f32(const float * x, const float * y, float * dst, co dst[i] = x[i] * y[i%ky]; } -static const float GELU_COEF_A = 0.044715f; -static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; - static __global__ void gelu_f32(const float * x, float * dst, const int k) { + const float GELU_COEF_A = 0.044715f; + const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; const int i = blockDim.x*blockIdx.x + threadIdx.x; if (i >= k) { @@ -1328,8 +1339,9 @@ static __global__ void dequantize_block(const void * __restrict__ vx, float * __ y[iybs + iqs + y_offset] = v.y; } -static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { -#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics +static __device__ __forceinline__ float vec_dot_q4_0_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq; int vi; @@ -1350,11 +1362,12 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restric return sumi*d; #else return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= 610 +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } -static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { -#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics +static __device__ __forceinline__ float vec_dot_q4_1_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq; const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]); @@ -1375,11 +1388,12 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restric return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block #else return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= 610 +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } -static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { -#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics +static __device__ __forceinline__ float vec_dot_q5_0_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq; int qs; @@ -1410,11 +1424,12 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restric return sumi*d; #else return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= 610 +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } -static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { -#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics +static __device__ __forceinline__ float vec_dot_q5_1_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq; const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]); @@ -1444,11 +1459,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restric return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block #else return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= 610 +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } -static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { -#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics +static __device__ __forceinline__ float vec_dot_q8_0_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq; int vi; @@ -1463,7 +1479,220 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restric return sumi*d; #else return 0.0f; // only to satisfy the compiler -#endif // __CUDA_ARCH__ >= 610 +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +static __device__ __forceinline__ float vec_dot_q2_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q2_K * bq2_K = (const block_q2_K *) vbq; + + const int bq8_offset = QR2_K * (iqs / QI8_1); + const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + + const float d = bq2_K->d; + const float dmin = bq2_K->dmin; + + const int v = *((int *) &bq2_K->qs[sizeof(int) * iqs]); + + for (int i = 0; i < QR2_K; ++i) { + const int sc = bq2_K->scales[scale_offset + 2*i]; + + const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; + const float d8i = bq8i->d; + + const int vi = (v >> (2*i)) & 0x03030303; + const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]); + + sumf_d += d8i * (__dp4a(vi, ui, 0) * (sc & 0xF)); // SIMD dot product + sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * (sc >> 4)); // multiply constant q2_K part with sum of q8_1 values + } + + return d*sumf_d - dmin*sumf_m; +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +static __device__ __forceinline__ float vec_dot_q3_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q3_K * bq3_K = (const block_q3_K *) vbq; + + const int bq8_offset = QR3_K * (iqs / (QI3_K/2)); + const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); + + float sumf = 0.0f; + + const float d = bq3_K->d; + + int vl; + memcpy(&vl, &bq3_K->qs[sizeof(int) * iqs], sizeof(int)); + + int vh; + memcpy(&vh, &bq3_K->hmask[sizeof(int) * (iqs % (QI3_K/2))], sizeof(int)); + vh = ~vh; // invert the mask so that a 0/1 results in 4/0 being subtracted + vh >>= bq8_offset; + + for (int i = 0; i < QR3_K; ++i) { + const int isc = scale_offset + 2*i; + + const int isc_low = isc % (QK_K/32); + const int sc_shift_low = 4 * (isc / (QK_K/32)); + const int sc_low = (bq3_K->scales[isc_low] >> sc_shift_low) & 0xF; + + const int isc_high = isc % (QK_K/64); + const int sc_shift_high = 2 * (isc / (QK_K/64)); + const int sc_high = ((bq3_K->scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4; + + const int sc = (sc_low | sc_high) - 32; + + const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; + const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]); + const float d8i = bq8i->d; + + const int vil = (vl >> (2*i)) & 0x03030303; + + const int vih = ((vh >> i) << 2) & 0x04040404; + + const int vi = __vsubss4(vil, vih); + + sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product + } + + return d*sumf; +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +static __device__ __forceinline__ float vec_dot_q4_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q4_K * bq4_K = (const block_q4_K *) vbq; + + const int bq8_offset = QR4_K * (iqs / QI8_1); + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + + const float d = bq4_K->d; + const float dmin = bq4_K->dmin; + + const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]); + + for (int i = 0; i < QR4_K; ++i) { + const int isc = bq8_offset + i; + + uint8_t sc, m; + get_scale_min_k4(isc, bq4_K->scales, sc, m); + + const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; + const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]); + const float d8i = bq8i->d; + + const int vi = (v >> (4*i)) & 0x0F0F0F0F; + + sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product + sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q4_K with sum of q8_1 values + } + + return d*sumf_d - dmin*sumf_m; +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +static __device__ __forceinline__ float vec_dot_q5_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q5_K * bq5_K = (const block_q5_K *) vbq; + + const int bq8_offset = QR5_K * (iqs / QI8_1); + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + + const float d = bq5_K->d; + const float dmin = bq5_K->dmin; + + const int vl = *((int *) &bq5_K->qs[sizeof(int) * iqs]); + + const int vh = (*((int *) &bq5_K->qh[sizeof(int) * (iqs % (QI5_K/4))])) >> bq8_offset; + + for (int i = 0; i < QR5_K; ++i) { + const int isc = bq8_offset + i; + + uint8_t sc, m; + get_scale_min_k4(isc, bq5_K->scales, sc, m); + + const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; + const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]); + const float d8i = bq8i->d; + + const int vil = (vl >> (4*i)) & 0x0F0F0F0F; + + const int vih = ((vh >> i) << 4) & 0x10101010; + + const int vi = vil | vih; + + sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product + sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q5_K with sum of q8_1 values + } + + return d*sumf_d - dmin*sumf_m; +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +static __device__ __forceinline__ float vec_dot_q6_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q6_K * bq6_K = (const block_q6_K *) vbq; + + const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4); + const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8); + const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4)); + + float sumf = 0.0f; + + const float d = bq6_K->d; + + int vl; + memcpy(&vl, &bq6_K->ql[sizeof(int) * iqs], sizeof(int)); + + int vh; + memcpy(&vh, &bq6_K->qh[sizeof(int) * ((QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4))], sizeof(int)); + + for (int i = 0; i < QR6_K; ++i) { + const int sc = bq6_K->scales[scale_offset + 4*i]; + + const block_q8_1 * bq8i = bq8_1 + bq8_offset + 2*i; + const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % (QI8_1))]); + const float d8i = bq8i->d; + + const int vil = (vl >> (4*i)) & 0x0F0F0F0F; + + const int vih = ((vh >> (vh_shift + 4*i)) << 4) & 0x30303030; + + const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32 + + sumf += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product + } + + return d*sumf; +#else + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A } template @@ -1486,7 +1715,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * for (int i = 0; i < blocks_per_row; i += blocks_per_warp) { const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index - const int iby = i + threadIdx.x / qi; // y block index + const int iby = (i + threadIdx.x / qi) * qk/QK8_1; // y block index that aligns with ibx const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int @@ -1724,6 +1953,40 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c dst[i + 1] = x0*sin_theta + x1*cos_theta; } +static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) { + const int col = blockDim.x*blockIdx.x + threadIdx.x; + const int half_n_dims = ncols/4; + + if (col >= half_n_dims) { + return; + } + + const int row = blockDim.y*blockIdx.y + threadIdx.y; + const int i = row*ncols + col; + + const float col_theta_scale = powf(theta_scale, col); + + const float theta = p*col_theta_scale; + const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta); + + const float x0 = x[i + 0]; + const float x1 = x[i + half_n_dims]; + + dst[i + 0] = x0*cos_theta - x1*sin_theta; + dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta; + + const float block_theta = block_p*col_theta_scale; + const float sin_block_theta = sinf(block_theta); + const float cos_block_theta = cosf(block_theta); + + const float x2 = x[i + half_n_dims * 2]; + const float x3 = x[i + half_n_dims * 3]; + + dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta; + dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta; +} + static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) { const int col = blockDim.x*blockIdx.x + threadIdx.x; const int row = blockDim.y*blockIdx.y + threadIdx.y; @@ -1789,9 +2052,9 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale dst[i] = scale * x[i]; } -static void add_f32_cuda(const float * x, const float * y, float * dst, const int k, cudaStream_t stream) { - const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE; - add_f32<<>>(x, y, dst, k); +static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) { + const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE; + add_f32<<>>(x, y, dst, kx, ky); } static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, const int k, cudaStream_t stream) { @@ -1985,7 +2248,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f } static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(ncols % QK4_0 == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); @@ -1994,7 +2257,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * } static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(ncols % QK4_1 == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); @@ -2003,7 +2266,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * } static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(ncols % QK5_0 == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); @@ -2012,7 +2275,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * } static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(ncols % QK5_1 == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); @@ -2021,7 +2284,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * } static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + GGML_ASSERT(ncols % QK8_0 == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); @@ -2029,6 +2292,51 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * <<>>(vx, vy, dst, ncols, nrows); } +static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + +static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; dequantize_block<1, 1, convert_f16><<>>(vx, y, k); @@ -2121,6 +2429,14 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i rope_f32<<>>(x, dst, ncols, p, theta_scale); } +static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) { + GGML_ASSERT(nrows % 4 == 0); + const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1); + const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE); + const dim3 block_nums(num_blocks_x, nrows, 1); + rope_glm_f32<<>>(x, dst, ncols, p, block_p, theta_scale); +} + static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) { const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1); const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE; @@ -2350,17 +2666,15 @@ inline void ggml_cuda_op_add( GGML_ASSERT(src1_ddf_i != nullptr); GGML_ASSERT(dst_ddf_i != nullptr); - // TODO: support broadcasting - GGML_ASSERT(ggml_nelements(src0) == ggml_nelements(src1)); - const int64_t ne00 = src0->ne[0]; const int64_t i01_diff = i01_high - i01_low; const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; // compute if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main); + add_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main); } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { add_f16_f32_f16_cuda((half *) src0_ddq_i, src1_ddf_i, (half *) dst_ddf_i, ne00*i01_diff, cudaStream_main); } else { @@ -2384,19 +2698,12 @@ inline void ggml_cuda_op_mul( GGML_ASSERT(dst_ddf_i != nullptr); const int64_t ne00 = src0->ne[0]; + const int64_t i01_diff = i01_high - i01_low; + const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; - for (int64_t i01 = i01_low; i01 < i01_high; i01++) { - const int64_t i11 = i1*ne11 + i01%ne11; // broadcast src1 across src0 - - float * src0_ddf_i01 = src0_ddf_i + i01*ne00; - float * src1_ddf_i01 = src1_ddf_i + i11*ne10; - float * dst_ddf_i01 = dst_ddf_i + i01*ne00; - - // compute - mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main); - } + mul_f32_cuda(src0_ddf_i, src1_ddf_i, dst_ddf_i, ne00*i01_diff, ne10*ne11, cudaStream_main); (void) dst; (void) src0_ddq_i; @@ -2509,13 +2816,22 @@ inline void ggml_cuda_op_mul_mat_vec( int id; CUDA_CHECK(cudaGetDevice(&id)); - const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 || + bool mul_mat_vec_q_implemented = + src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 || src0->type == GGML_TYPE_Q8_0; - - const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 610 && mul_mat_vec_q_implemented; +#if QK_K == 256 + mul_mat_vec_q_implemented = mul_mat_vec_q_implemented || + src0->type == GGML_TYPE_Q2_K || + src0->type == GGML_TYPE_Q3_K || + src0->type == GGML_TYPE_Q4_K || + src0->type == GGML_TYPE_Q5_K || + src0->type == GGML_TYPE_Q6_K; +#endif // QK_K == 256 + + const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= MIN_CC_DP4A && mul_mat_vec_q_implemented; #endif if (use_mul_mat_vec_q) { @@ -2541,6 +2857,21 @@ inline void ggml_cuda_op_mul_mat_vec( case GGML_TYPE_Q8_0: mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); break; + case GGML_TYPE_Q2_K: + mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q3_K: + mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q4_K: + mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q5_K: + mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; + case GGML_TYPE_Q6_K: + mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main); + break; default: GGML_ASSERT(false); break; @@ -2675,13 +3006,21 @@ inline void ggml_cuda_op_rope( const int n_past = ((int32_t *) src1->data)[0]; const int n_dims = ((int32_t *) src1->data)[1]; const int mode = ((int32_t *) src1->data)[2]; - GGML_ASSERT(mode == 0); + const int n_ctx = ((int32_t *) src1->data)[3]; const float theta_scale = powf(10000.0, -2.0f/n_dims); const float p = ((mode & 1) == 0 ? n_past + i02 : i02); + bool is_glm = mode & 4; + // compute - rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main); + if (is_glm) { + const float id_p = min(p, n_ctx - 2.f); + const float block_p = max(p - (n_ctx - 2.f), 0.f); + rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main); + } else { + rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main); + } (void) dst; (void) src0_ddq_i; @@ -3254,6 +3593,11 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens (void) dst; } +void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_cpy(src0, dst, nullptr); + (void) src1; +} + void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true); @@ -3363,6 +3707,22 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) { delete extra; } +static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr; +static size_t g_temp_tensor_extra_index = 0; + +static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { + if (g_temp_tensor_extras == nullptr) { + g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES]; + } + + size_t alloc_index = g_temp_tensor_extra_index; + g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES; + struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index]; + memset(extra, 0, sizeof(*extra)); + + return extra; +} + void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace) { if (scratch && g_scratch_size == 0) { return; @@ -3371,7 +3731,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo // recursively assign CUDA buffers until a compute tensor is found if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) { const ggml_op src0_op = tensor->src[0]->op; - if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) { + if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) { ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace); } } @@ -3380,8 +3740,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo } tensor->backend = GGML_BACKEND_GPU; - struct ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu; - memset(extra, 0, sizeof(*extra)); + struct ggml_tensor_extra_gpu * extra; const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) || tensor->op == GGML_OP_VIEW || @@ -3396,10 +3755,12 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo if (tensor->op == GGML_OP_VIEW) { memcpy(&offset, tensor->src[2]->data, sizeof(size_t)); } + extra = ggml_cuda_alloc_temp_tensor_extra(); extra->data_device[g_main_device] = src0_ddc + offset; } else if (tensor->op == GGML_OP_CPY) { struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra; void * src1_ddv = src1_extra->data_device[g_main_device]; + extra = ggml_cuda_alloc_temp_tensor_extra(); extra->data_device[g_main_device] = src1_ddv; } else if (scratch) { GGML_ASSERT(size <= g_scratch_size); @@ -3412,6 +3773,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo CUDA_CHECK(cudaMalloc(&data, g_scratch_size)); g_scratch_buffer = data; } + extra = ggml_cuda_alloc_temp_tensor_extra(); extra->data_device[g_main_device] = data + g_scratch_offset; g_scratch_offset += size; @@ -3421,6 +3783,8 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo void * data; CUDA_CHECK(cudaMalloc(&data, size)); CUDA_CHECK(cudaMemset(data, 0, size)); + extra = new ggml_tensor_extra_gpu; + memset(extra, 0, sizeof(*extra)); extra->data_device[g_main_device] = data; } @@ -3473,6 +3837,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); switch (tensor->op) { + case GGML_OP_DUP: + if (!any_on_device) { + return false; + } + func = ggml_cuda_dup; + break; case GGML_OP_ADD: if (!any_on_device) { return false; @@ -3527,6 +3897,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ } func = ggml_cuda_cpy; break; + case GGML_OP_CONT: + if (!any_on_device) { + return false; + } + func = ggml_cuda_dup; + break; case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_PERMUTE: diff --git a/ggml-metal.m b/ggml-metal.m index 02dc9beb94c58..ee205bcdf773c 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -739,12 +739,8 @@ void ggml_metal_graph_compute( [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13]; [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14]; - if (src0t == GGML_TYPE_Q4_0) { - [encoder dispatchThreadgroups:MTLSizeMake(ne01 / 8+((ne01 % 8) & 0x01), ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src0t == GGML_TYPE_Q4_1) { - [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q3_K || @@ -885,28 +881,35 @@ void ggml_metal_graph_compute( const int n_past = ((int32_t *)(src1->data))[0]; + float freq_base; + float freq_scale; + memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float)); + [encoder setComputePipelineState:ctx->pipeline_rope]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; - [encoder setBytes:&n_past length:sizeof( int) atIndex:18]; - [encoder setBytes:&n_dims length:sizeof( int) atIndex:19]; - [encoder setBytes:&mode length:sizeof( int) atIndex:20]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&n_past length:sizeof( int) atIndex:18]; + [encoder setBytes:&n_dims length:sizeof( int) atIndex:19]; + [encoder setBytes:&mode length:sizeof( int) atIndex:20]; + [encoder setBytes:&freq_base length:sizeof(float) atIndex:21]; + [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; diff --git a/ggml-metal.metal b/ggml-metal.metal index 30d60fa58d686..9f9a4fbd74446 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -395,9 +395,12 @@ kernel void kernel_mul_mat_q4_0_f32( // each thread in a SIMD group deals with 1 block. for (int column = 0; column < nb / N_SIMDWIDTH; column++) { + float sumy = 0; for (int i = 0; i < QK4_0 / 4; i++) { y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0) + 4 * i)); + sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3]; } + sumy *= (-8.f); for (int row = 0; row < N_DST; row++) { // prefetch next x block @@ -405,39 +408,50 @@ kernel void kernel_mul_mat_q4_0_f32( // calculate float d = qb_curr.d; - float2 acc = {0.0f, 0.0f}; + float acc = sumy; for (int i = 0; i < 16; i++) { - acc[0] += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4); - acc[1] += yl[i] + yl[i+16]; + acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4); } - sumf[row] += d * (acc[0] - 8.f*acc[1]); + sumf[row] += d * acc; qb_curr = qb_next; } } - for (int i = 0; i < QK4_0 / 4; i++) { - y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + (nb / N_SIMDWIDTH) * QK4_0) + 4 * i)); - } - - for (int row = 0; row < N_DST; row++) { - // prefetch next x block - qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (nb / N_SIMDWIDTH + ((row + 1) / N_DST)) * N_SIMDWIDTH]; - - // calculate - float d = qb_curr.d; - float2 acc = {0.0f, 0.0f}; - for (int i = 0; i < 16; i++) { - acc[0] += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4); - acc[1] += yl[i] + yl[i+16]; + if (nb % N_SIMDWIDTH == 0) { + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) { + dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum; + } } - if (tiisg < nb % N_SIMDWIDTH) { - sumf[row] += d * (acc[0] - 8.f*acc[1]); + } else { + + float sumy = 0; + for (int i = 0; i < QK4_0 / 4; i++) { + y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + (nb / N_SIMDWIDTH) * QK4_0) + 4 * i)); + sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3]; } - qb_curr = qb_next; + sumy *= (-8.f); + + for (int row = 0; row < N_DST; row++) { + // prefetch next x block + qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (nb / N_SIMDWIDTH + ((row + 1) / N_DST)) * N_SIMDWIDTH]; + + // calculate + float d = qb_curr.d; + float acc = sumy; + for (int i = 0; i < 16; i++) { + acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4); + } + if (tiisg < nb % N_SIMDWIDTH) { + sumf[row] += d * acc; + } + qb_curr = qb_next; - all_sum = simd_sum(sumf[row]); - if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) { - dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum; + all_sum = simd_sum(sumf[row]); + if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) { + dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum; + } } } } @@ -449,65 +463,83 @@ kernel void kernel_mul_mat_q4_1_f32( constant int64_t & ne00, constant int64_t & ne10, constant int64_t & ne0, - threadgroup float * sum [[threadgroup(0)]], + constant int64_t & ne01[[buffer(4)]], uint2 tgpig[[threadgroup_position_in_grid]], - uint2 tpitg[[thread_position_in_threadgroup]], - uint2 tptg[[threads_per_threadgroup]]) { - const int nb = ne00/QK4_1; - - const int64_t r0 = tgpig.x; - const int64_t r1 = tgpig.y; - - device const block_q4_1 * x = (device const block_q4_1 *) src0 + r0*nb; + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + const int nb = ne00/QK4_0; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + device const block_q4_1 * x = (device const block_q4_1 *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb; device const float * y = (device const float *) src1 + r1*ne10; + block_q4_1 qb_curr, qb_next; + float4 y_curr[8]; // src1 vector cache + float sumf[N_DST]={0.f}, all_sum; + thread float * yl=(thread float *)y_curr; - const uint nth = tptg.x*tptg.y; - const uint ith = tptg.y*tpitg.x + tpitg.y; - - const int ix = tpitg.y/4; // 0 or 1 - const int iy = tpitg.y - 4*ix; // 0...3 - - const int first = 4 * iy; - - float sumf = 0; - - for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) { - - const float d = (float)x[i].d; - const float m = (float)x[i].m; + // bootstrap + qb_curr = x[tiisg]; + // each thread in a SIMD group deals with 1 block. + for (int column = 0; column < nb / N_SIMDWIDTH; column++) { - device const uint8_t * xl = x[i].qs + first; - device const float * yl = y + i * QK4_1 + first; + float sumy = 0; + for (int i = 0; i < QK4_0 / 4; i++) { + y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0) + 4 * i)); + sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3]; + } - float2 acc = {0.0f, 0.0f}; + for (int row = 0; row < N_DST; row++) { + // prefetch next x block + qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (column + ((row + 1) / N_DST)) * N_SIMDWIDTH]; - for (int j = 0; j < 4; ++j) { + // calculate + const float d = qb_curr.d; + const float m = qb_curr.m; + float acc = 0.f; + for (int i = 0; i < 16; i++) { + acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4); + } + sumf[row] += d * acc + m * sumy; + qb_curr = qb_next; + } + } - acc[0] += yl[j+ 0] * (d * (xl[j] & 0xF) + m); - acc[1] += yl[j+16] * (d * (xl[j] >> 4) + m); + if (nb % N_SIMDWIDTH == 0) { + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) { + dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum; + } + } + } else { + float sumy = 0; + for (int i = 0; i < QK4_0 / 4; i++) { + y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + (nb / N_SIMDWIDTH) * QK4_0) + 4 * i)); + sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3]; } - sumf += acc[0] + acc[1]; - } + for (int row = 0; row < N_DST; row++) { + // prefetch next x block + qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (nb / N_SIMDWIDTH + ((row + 1) / N_DST)) * N_SIMDWIDTH]; - sum[ith] = sumf; + // calculate + const float d = qb_curr.d; + const float m = qb_curr.m; + float acc = 0.f; + for (int i = 0; i < 16; i++) { + acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4); + } + if (tiisg < nb % N_SIMDWIDTH) { + sumf[row] += d * acc + m * sumy; + } + qb_curr = qb_next; - // - // Accumulate the sum from all threads in the threadgroup - // - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith%4 == 0) { - sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3]; - } - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith%16 == 0) { - sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12]; - } - threadgroup_barrier(mem_flags::mem_threadgroup); - if (ith == 0) { - for (uint i = 16; i < nth; i += 16) sum[0] += sum[i]; - dst[r1*ne0 + r0] = sum[0]; + all_sum = simd_sum(sumf[row]); + if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) { + dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum; + } + } } } @@ -624,17 +656,19 @@ kernel void kernel_rope( constant int & n_past, constant int & n_dims, constant int & mode, + constant float & freq_base, + constant float & freq_scale, uint3 tpig[[thread_position_in_grid]]) { const int64_t i3 = tpig[2]; const int64_t i2 = tpig[1]; const int64_t i1 = tpig[0]; const bool is_neox = mode & 2; - const float theta_scale = pow(10000.0, -2.0f/n_dims); + const float theta_scale = pow(freq_base, -2.0f/n_dims); const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); - float theta = (float)p; + float theta = freq_scale * (float)p; if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { diff --git a/ggml.c b/ggml.c index c137ae658df7f..c56a3d0e0c0a2 100644 --- a/ggml.c +++ b/ggml.c @@ -31,11 +31,17 @@ #include #endif +// static_assert should be a #define, but if it's not, +// fall back to the _Static_assert C11 keyword. // if C99 - static_assert is noop // ref: https://stackoverflow.com/a/53923785/4039976 #ifndef static_assert +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) +#define static_assert(cond, msg) _Static_assert(cond, msg) +#else #define static_assert(cond, msg) struct global_scope_noop_trick #endif +#endif #if defined(_MSC_VER) // disable "possible loss of data" to avoid hundreds of casts @@ -112,10 +118,6 @@ typedef void * thread_ret_t; #endif #endif -#ifdef __HAIKU__ -#define static_assert(cond, msg) _Static_assert(cond, msg) -#endif - /*#define GGML_PERF*/ #define GGML_DEBUG 0 #define GGML_GELU_FP16 @@ -4410,8 +4412,8 @@ void ggml_free(struct ggml_context * ctx) { if (&g_state.contexts[i].context == ctx) { g_state.contexts[i].used = false; - GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n", - __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size); + GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n", + __func__, i, ggml_used_mem(ctx)); if (ctx->mem_buffer_owned) { GGML_ALIGNED_FREE(ctx->mem_buffer); @@ -6954,6 +6956,8 @@ struct ggml_tensor * ggml_rope_impl( int n_past, int n_dims, int mode, + float freq_base, + float freq_scale, int n_ctx, bool inplace) { GGML_ASSERT(n_past >= 0); @@ -6967,12 +6971,14 @@ struct ggml_tensor * ggml_rope_impl( ggml_scratch_save(ctx); - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4); + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6); ((int32_t *) b->data)[0] = n_past; ((int32_t *) b->data)[1] = n_dims; ((int32_t *) b->data)[2] = mode; ((int32_t *) b->data)[3] = n_ctx; + memcpy((int32_t *) b->data + 4, &freq_base, sizeof(float)); + memcpy((int32_t *) b->data + 5, &freq_scale, sizeof(float)); ggml_scratch_load(ctx); @@ -6991,7 +6997,7 @@ struct ggml_tensor * ggml_rope( int n_dims, int mode, int n_ctx) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false); + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, false); } struct ggml_tensor * ggml_rope_inplace( @@ -7001,7 +7007,19 @@ struct ggml_tensor * ggml_rope_inplace( int n_dims, int mode, int n_ctx) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true); + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 10000.0f, 1.0f, n_ctx, true); +} + +struct ggml_tensor * ggml_rope_custom_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + float freq_base, + float freq_scale, + int n_ctx) { + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, freq_base, freq_scale, n_ctx, true); } // ggml_rope_back @@ -10684,6 +10702,8 @@ static void ggml_compute_forward_mul_mat( const enum ggml_type type = src0->type; + const bool src1_cont = ggml_is_contiguous(src1); + ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; @@ -10747,7 +10767,7 @@ static void ggml_compute_forward_mul_mat( float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); if (type != GGML_TYPE_F32) { - float * const wdata = params->wdata; + float * const wdata = params->wdata; ggml_to_float_t const to_float = type_traits[type].to_float; size_t id = 0; @@ -10805,7 +10825,7 @@ static void ggml_compute_forward_mul_mat( // src1 rows const int64_t nr1 = ne11*ne12*ne13; - void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; for (int64_t ir1 = 0; ir1 < nr1; ++ir1) { @@ -10828,7 +10848,15 @@ static void ggml_compute_forward_mul_mat( const int64_t i3 = i13; const char * src0_row = (const char *) src0->data + ( 0 + i02*nb02 + i03*nb03 ); - const char * src1_col = (const char *) wdata + (i11 + i12*ne11 + i13*ne12*ne11)*row_size; + + // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using + // the original src1 data pointer, so we should index using the indices directly + // TODO: this is a bit of a hack, we should probably have a better way to handle this + const char * src1_col = (const char *) wdata + + (src1_cont || src1->type != vec_dot_type + ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size + : (i11*nb11 + i12*nb12 + i13*nb13)); float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); @@ -12062,16 +12090,21 @@ static void ggml_compute_forward_rope_f32( const struct ggml_tensor * src1, struct ggml_tensor * dst) { GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_nelements(src1) == 4); + GGML_ASSERT(ggml_nelements(src1) == 6); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } + float freq_base; + float freq_scale; + const int n_past = ((int32_t *) src1->data)[0]; const int n_dims = ((int32_t *) src1->data)[1]; const int mode = ((int32_t *) src1->data)[2]; const int n_ctx = ((int32_t *) src1->data)[3]; + memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float)); assert(n_past >= 0); @@ -12100,7 +12133,7 @@ static void ggml_compute_forward_rope_f32( // row index used to determine which thread to use int ir = 0; - const float theta_scale = powf(10000.0, -2.0f/n_dims); + const float theta_scale = powf(freq_base, -2.0f/n_dims); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -12112,7 +12145,7 @@ static void ggml_compute_forward_rope_f32( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = (float)p; + float theta = freq_scale * (float)p; if (is_glm) { theta = MIN(p, n_ctx - 2); @@ -12189,16 +12222,21 @@ static void ggml_compute_forward_rope_f16( const struct ggml_tensor * src1, struct ggml_tensor * dst) { GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_nelements(src1) == 4); + GGML_ASSERT(ggml_nelements(src1) == 6); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } + float freq_base; + float freq_scale; + const int n_past = ((int32_t *) src1->data)[0]; const int n_dims = ((int32_t *) src1->data)[1]; const int mode = ((int32_t *) src1->data)[2]; const int n_ctx = ((int32_t *) src1->data)[3]; + memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float)); assert(n_past >= 0); @@ -12227,7 +12265,7 @@ static void ggml_compute_forward_rope_f16( // row index used to determine which thread to use int ir = 0; - const float theta_scale = powf(10000.0, -2.0f/n_dims); + const float theta_scale = powf(freq_base, -2.0f/n_dims); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -12239,7 +12277,7 @@ static void ggml_compute_forward_rope_f16( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = (float)p; + float theta = freq_scale * (float)p; if (is_glm) { theta = MIN(p, n_ctx - 2); @@ -12300,7 +12338,7 @@ static void ggml_compute_forward_rope_f16( const float x0 = GGML_FP16_TO_FP32(src[0]); const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]); - dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); } } @@ -12982,12 +13020,13 @@ static void ggml_compute_forward_conv_1d( }; } -// ggml_compute_forward_conv_2d_sk_p0 +// ggml_compute_forward_conv_2d -static void ggml_compute_forward_conv_2d_sk_p0_f16_f32( +static void ggml_compute_forward_conv_2d_f16_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, struct ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); @@ -13007,28 +13046,37 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32( // size of the convolution row - the kernel size unrolled across all channels const int ew0 = nk0*nk1*ne02; + const int32_t s0 = ((const int32_t*)(opt0->data))[0]; + const int32_t s1 = ((const int32_t*)(opt0->data))[1]; + const int32_t p0 = ((const int32_t*)(opt0->data))[2]; + const int32_t p1 = ((const int32_t*)(opt0->data))[3]; + const int32_t d0 = ((const int32_t*)(opt0->data))[4]; + const int32_t d1 = ((const int32_t*)(opt0->data))[5]; + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); if (params->type == GGML_TASK_INIT) { - // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); // prepare source data (src1) { ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - for (int i13 = 0; i13 < ne13; i13++) { - for (int i12 = 0; i12 < ne12; i12++) { - const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12); - ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0); + for (int i12 = 0; i12 < ne12; i12++) { + const float * const src = (float *)((char *) src1->data + i12*nb12); + ggml_fp16_t * dst_data = wdata; + + for (int i1 = 0; i1 < ne1; i1++) { + for (int i0 = 0; i0 < ne0; i0++) { + for (int ik1 = 0; ik1 < nk1; ik1++) { + for (int ik0 = 0; ik0 < nk0; ik0++) { + const int idx0 = i0*s0 + ik0*d0 - p0; + const int idx1 = i1*s1 + ik1*d1 - p1; - for (int i1 = 0; i1 < ne1; i1++) { - for (int i0 = 0; i0 < ne0; i0++) { - for (int ik1 = 0; ik1 < nk1; ik1++) { - for (int ik0 = 0; ik0 < nk0; ik0++) { + if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) { dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] = - GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]); + GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]); } } } @@ -13071,19 +13119,21 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32( } } -static void ggml_compute_forward_conv_2d_sk_p0( +static void ggml_compute_forward_conv_2d( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - struct ggml_tensor * dst) { + const struct ggml_tensor * opt0, + struct ggml_tensor * dst + ) { switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_conv_2d_sk_p0_f16_f32(params, src0, src1, dst); + ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, opt0, dst); } break; case GGML_TYPE_F32: { - //ggml_compute_forward_conv_2d_sk_p0_f32(params, src0, src1, dst); + //ggml_compute_forward_conv_2d_f32(params, src0, src1, opt0, dst); GGML_ASSERT(false); } break; default: @@ -13093,32 +13143,6 @@ static void ggml_compute_forward_conv_2d_sk_p0( } } -// ggml_compute_forward_conv_2d - -static void ggml_compute_forward_conv_2d( - const struct ggml_compute_params* params, - const struct ggml_tensor* src0, - const struct ggml_tensor* src1, - const struct ggml_tensor* opt0, - struct ggml_tensor* dst) { - const int32_t s0 = ((const int32_t*)(opt0->data))[0]; - const int32_t s1 = ((const int32_t*)(opt0->data))[1]; - const int32_t p0 = ((const int32_t*)(opt0->data))[2]; - const int32_t p1 = ((const int32_t*)(opt0->data))[3]; - const int32_t d0 = ((const int32_t*)(opt0->data))[4]; - const int32_t d1 = ((const int32_t*)(opt0->data))[5]; - GGML_ASSERT(d0 == 1); // dilation not supported - GGML_ASSERT(d1 == 1); - GGML_ASSERT(p0 == 0); // padding not supported - GGML_ASSERT(p1 == 0); - - if (s0 == src0->ne[0] && s1 == src0->ne[1]) { - ggml_compute_forward_conv_2d_sk_p0(params, src0, src1, dst); - } else { - GGML_ASSERT(false); // only stride equal to kernel size is supported - } -} - // ggml_compute_forward_pool_1d_sk_p0 static void ggml_compute_forward_pool_1d_sk_p0( @@ -15712,7 +15736,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // necessary for llama if (src0->grad) { assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 4); + assert(ggml_nelements(src1) == 6); const int n_past = ((int32_t *) src1->data)[0]; const int n_dims = ((int32_t *) src1->data)[1]; const int mode = ((int32_t *) src1->data)[2]; @@ -15733,7 +15757,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { if (src0->grad) { assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 4); + assert(ggml_nelements(src1) == 3); const int n_past = ((int32_t *) src1->data)[0]; const int n_dims = ((int32_t *) src1->data)[1]; const int mode = ((int32_t *) src1->data)[2]; @@ -16293,8 +16317,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { if (GGML_OP_HAS_FINALIZE[node->op]) { params.nth = n_tasks_arr[node_n]; ggml_compute_forward(¶ms, node); - ggml_graph_compute_perf_stats_node(node, state->shared); } + ggml_graph_compute_perf_stats_node(node, state->shared); } // distribute new work or execute it direct if 1T @@ -16324,8 +16348,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { if (GGML_OP_HAS_FINALIZE[node->op]) { params.type = GGML_TASK_FINALIZE; ggml_compute_forward(¶ms, node); - ggml_graph_compute_perf_stats_node(node, state->shared); } + + ggml_graph_compute_perf_stats_node(node, state->shared); } else { break; } @@ -16575,19 +16600,22 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { const int64_t ne11 = node->src[1]->ne[1]; // H const int64_t ne12 = node->src[1]->ne[2]; // C + const int64_t ne0 = node->ne[0]; + const int64_t ne1 = node->ne[1]; + const int64_t ne2 = node->ne[2]; const int64_t nk = ne00*ne01; + const int64_t ew0 = nk * ne02; - UNUSED(ne02); UNUSED(ne03); - UNUSED(nk); + UNUSED(ne2); size_t cur = 0; if (node->src[0]->type == GGML_TYPE_F16 && - node->src[1]->type == GGML_TYPE_F32) { - cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12); + node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0); } else if (node->src[0]->type == GGML_TYPE_F32 && - node->src[1]->type == GGML_TYPE_F32) { + node->src[1]->type == GGML_TYPE_F32) { cur = sizeof(float)* (ne10*ne11*ne12); } else { GGML_ASSERT(false); @@ -16864,9 +16892,6 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char } void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { - //assert(cgraph->work == NULL); - //assert(cgraph->work_size == 0); - uint64_t size_eval = 0; // compute size of intermediate results @@ -17305,9 +17330,6 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { GGML_PRINT("=== GRAPH ===\n"); - GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads); - GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size); - GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes); for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; diff --git a/ggml.h b/ggml.h index b88c35bae7267..24856a255c9a6 100644 --- a/ggml.h +++ b/ggml.h @@ -1121,6 +1121,17 @@ extern "C" { int mode, int n_ctx); + // custom RoPE, in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_rope_custom_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + float freq_base, + float freq_scale, + int n_ctx); + // rotary position embedding backward, i.e compute dx from dy // a - dy GGML_API struct ggml_tensor * ggml_rope_back( diff --git a/k_quants.h b/k_quants.h index 6abe3d7b8f422..adc6a391376d4 100644 --- a/k_quants.h +++ b/k_quants.h @@ -15,6 +15,14 @@ #define K_SCALE_SIZE 12 #endif +#ifndef static_assert +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) +#define static_assert(cond, msg) _Static_assert(cond, msg) +#else +#define static_assert(cond, msg) struct global_scope_noop_trick +#endif +#endif + // // Super-block quantization structures // diff --git a/llama-util.h b/llama-util.h index 43b6f05adeb71..042ebe43c48e1 100644 --- a/llama-util.h +++ b/llama-util.h @@ -175,13 +175,13 @@ struct llama_mmap { llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { size = file->size; int fd = fileno(file->fp); - int flags = MAP_PRIVATE; + int flags = MAP_SHARED; // prefetch/readahead impairs performance on NUMA systems if (numa) { prefetch = 0; } #ifdef __linux__ if (prefetch) { flags |= MAP_POPULATE; } #endif - addr = mmap(NULL, file->size, PROT_READ | PROT_WRITE, flags, fd, 0); + addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); if (addr == MAP_FAILED) { throw std::runtime_error(format("mmap failed: %s", strerror(errno))); } @@ -223,7 +223,7 @@ struct llama_mmap { throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str())); } - addr = MapViewOfFile(hMapping, FILE_MAP_COPY, 0, 0, 0); + addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); error = GetLastError(); CloseHandle(hMapping); diff --git a/llama.cpp b/llama.cpp index 2d09d6ce76619..0f9d5346dc551 100644 --- a/llama.cpp +++ b/llama.cpp @@ -101,14 +101,15 @@ static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * // memory sizes // -static const std::map & MEM_REQ_SCRATCH0() +static const std::map & MEM_REQ_SCRATCH0(int n_ctx) { static std::map k_sizes = { - { MODEL_3B, 256ull * MB }, - { MODEL_7B, 512ull * MB }, - { MODEL_13B, 512ull * MB }, - { MODEL_30B, 512ull * MB }, - { MODEL_65B, 1024ull * MB }, + /* empirical scaling, still a guess */ + { MODEL_3B, ((size_t) n_ctx / 16ull + 128ull) * MB }, + { MODEL_7B, ((size_t) n_ctx / 16ull + 256ull) * MB }, + { MODEL_13B, ((size_t) n_ctx / 12ull + 256ull) * MB }, + { MODEL_30B, ((size_t) n_ctx / 10ull + 256ull) * MB }, + { MODEL_65B, ((size_t) n_ctx / 8ull + 512ull) * MB }, }; return k_sizes; } @@ -140,14 +141,14 @@ static const std::map & MEM_REQ_KV_SELF() // this is mostly needed for temporary mul_mat buffers to dequantize the data // not actually needed if BLAS is disabled -static const std::map & MEM_REQ_EVAL() +static const std::map & MEM_REQ_EVAL(int n_ctx) { static std::map k_sizes = { - { MODEL_3B, 512ull * MB }, - { MODEL_7B, 768ull * MB }, - { MODEL_13B, 1024ull * MB }, - { MODEL_30B, 1280ull * MB }, - { MODEL_65B, 1536ull * MB }, + { MODEL_3B, ((size_t) n_ctx / 256ull + 512ull) * MB }, + { MODEL_7B, ((size_t) n_ctx / 256ull + 768ull) * MB }, + { MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB }, + { MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB }, + { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB }, }; return k_sizes; } @@ -189,6 +190,10 @@ struct llama_hparams { uint32_t n_head = 32; uint32_t n_layer = 32; uint32_t n_rot = 64; + + float rope_freq_base = 10000.0f; + float rope_freq_scale = 1.0f; + enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16; bool operator!=(const llama_hparams & other) const { @@ -303,7 +308,7 @@ struct llama_model { }; struct llama_context { - llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {} + llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {} #ifdef GGML_USE_METAL ~llama_context() { if (ctx_metal) { @@ -324,7 +329,6 @@ struct llama_context { int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) const llama_model & model; - const llama_vocab & vocab; bool model_owner = false; @@ -648,7 +652,7 @@ struct llama_model_loader { *ctx_size_p = *mmapped_size_p = 0; for (const llama_load_tensor & lt : tensors_map.tensors) { *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE; - *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size; + *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16; } } @@ -844,6 +848,8 @@ struct llama_context_params llama_context_default_params() { /*.gpu_layers =*/ 0, /*.main_gpu =*/ 0, /*.tensor_split =*/ {0}, + /*.rope_freq_base =*/ 10000.0f, + /*.rope_freq_scale =*/ 1.0f, /*.progress_callback =*/ nullptr, /*.progress_callback_user_data =*/ nullptr, /*.low_vram =*/ false, @@ -967,6 +973,8 @@ static void llama_model_load_internal( int n_gpu_layers, int main_gpu, const float * tensor_split, + float rope_freq_base, + float rope_freq_scale, bool low_vram, ggml_type memory_type, bool use_mmap, @@ -1001,22 +1009,27 @@ static void llama_model_load_internal( } hparams.n_ctx = n_ctx; + + hparams.rope_freq_base = rope_freq_base; + hparams.rope_freq_scale = rope_freq_scale; } const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; { - fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version)); - fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab); - fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx); - fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd); - fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult); - fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head); - fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer); - fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); + fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version)); + fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab); + fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx); + fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd); + fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult); + fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head); + fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer); + fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); + fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); + fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype)); - fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); - fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); + fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); + fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); } if (file_version < LLAMA_FILE_VERSION_GGJT_V2) { @@ -1165,9 +1178,9 @@ static void llama_model_load_internal( const size_t mem_required = ctx_size + mmapped_size - vram_weights + // weights in VRAM not in memory - MEM_REQ_SCRATCH0().at(model.type) + + MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) + MEM_REQ_SCRATCH1().at(model.type) + - MEM_REQ_EVAL().at (model.type); + MEM_REQ_EVAL(hparams.n_ctx).at(model.type); // this is the memory required by one llama_state const size_t mem_required_state = @@ -1271,6 +1284,8 @@ static bool llama_model_load( int n_gpu_layers, int main_gpu, float * tensor_split, + float rope_freq_base, + float rope_freq_scale, bool low_vram, ggml_type memory_type, bool use_mmap, @@ -1279,7 +1294,7 @@ static bool llama_model_load( llama_progress_callback progress_callback, void *progress_callback_user_data) { try { - llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type, + llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type, use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data); return true; } catch (const std::exception & err) { @@ -1331,6 +1346,9 @@ static bool llama_eval_internal( const int n_rot = hparams.n_embd/hparams.n_head; const int n_gpu_layers = model.n_gpu_layers; + const float freq_base = hparams.rope_freq_base; + const float freq_scale = hparams.rope_freq_scale; + auto & mem_per_token = lctx.mem_per_token; auto & buf_compute = lctx.buf_compute; @@ -1428,11 +1446,11 @@ static bool llama_eval_internal( offload_func_kq(tmpq); ggml_set_name(tmpq, "tmpq"); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0); + struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0); offload_func_kq(Kcur); ggml_set_name(Kcur, "Kcur"); - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0); + struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0); offload_func_kq(Qcur); ggml_set_name(Qcur, "Qcur"); @@ -2187,7 +2205,7 @@ void llama_sample_classifier_free_guidance( struct llama_context * guidance_ctx, float scale, float smooth_factor) { - int64_t t_start_sample_us = t_start_sample_us = ggml_time_us(); + int64_t t_start_sample_us = ggml_time_us(); assert(ctx); auto n_vocab = llama_n_vocab(ctx); @@ -2675,8 +2693,9 @@ struct llama_model * llama_load_model_from_file( ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers, - params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock, - params.vocab_only, params.progress_callback, params.progress_callback_user_data)) { + params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram, + memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback, + params.progress_callback_user_data)) { delete model; fprintf(stderr, "%s: failed to load model\n", __func__); return nullptr; @@ -2697,7 +2716,7 @@ struct llama_context * llama_new_context_with_model( return nullptr; } - llama_context * ctx = new llama_context(*model, model->vocab); + llama_context * ctx = new llama_context(*model); if (params.seed == LLAMA_DEFAULT_SEED) { params.seed = time(NULL); @@ -2751,9 +2770,9 @@ struct llama_context * llama_new_context_with_model( ctx->embedding.resize(hparams.n_embd); } - ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type)); + ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type)); - ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type)); + ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type)); ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)); } @@ -3535,13 +3554,13 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) { return 0; } -int llama_tokenize( - struct llama_context * ctx, +int llama_tokenize_with_model( + const struct llama_model * model, const char * text, llama_token * tokens, int n_max_tokens, bool add_bos) { - auto res = llama_tokenize(ctx->vocab, text, add_bos); + auto res = llama_tokenize(model->vocab, text, add_bos); if (n_max_tokens < (int) res.size()) { fprintf(stderr, "%s: too many tokens\n", __func__); @@ -3555,8 +3574,29 @@ int llama_tokenize( return res.size(); } +int llama_tokenize( + struct llama_context * ctx, + const char * text, + llama_token * tokens, + int n_max_tokens, + bool add_bos) { + return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos); +} + +int llama_n_vocab_from_model(const struct llama_model * model) { + return model->vocab.id_to_token.size(); +} + +int llama_n_ctx_from_model(const struct llama_model * model) { + return model->hparams.n_ctx; +} + +int llama_n_embd_from_model(const struct llama_model * model) { + return model->hparams.n_embd; +} + int llama_n_vocab(const struct llama_context * ctx) { - return ctx->vocab.id_to_token.size(); + return ctx->model.vocab.id_to_token.size(); } int llama_n_ctx(const struct llama_context * ctx) { @@ -3567,19 +3607,27 @@ int llama_n_embd(const struct llama_context * ctx) { return ctx->model.hparams.n_embd; } -int llama_get_vocab( - const struct llama_context * ctx, +int llama_get_vocab_from_model( + const struct llama_model * model, const char * * strings, float * scores, int capacity) { - int n = std::min(capacity, (int) ctx->vocab.id_to_token.size()); + int n = std::min(capacity, (int) model->vocab.id_to_token.size()); for (int i = 0; ivocab.id_to_token[i].tok.c_str(); - scores[i] = ctx->vocab.id_to_token[i].score; + strings[i] = model->vocab.id_to_token[i].tok.c_str(); + scores[i] = model->vocab.id_to_token[i].score; } return n; } +int llama_get_vocab( + const struct llama_context * ctx, + const char * * strings, + float * scores, + int capacity) { + return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity); +} + float * llama_get_logits(struct llama_context * ctx) { return ctx->logits.data(); } @@ -3588,12 +3636,16 @@ float * llama_get_embeddings(struct llama_context * ctx) { return ctx->embedding.data(); } -const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) { - if (token >= llama_n_vocab(ctx)) { +const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) { + if (token >= llama_n_vocab_from_model(model)) { return nullptr; } - return ctx->vocab.id_to_token[token].tok.c_str(); + return model->vocab.id_to_token[token].tok.c_str(); +} + +const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) { + return llama_token_to_str_with_model(&ctx->model, token); } llama_token llama_token_bos() { diff --git a/llama.h b/llama.h index 4596b1ed4dedf..e744584f23fb3 100644 --- a/llama.h +++ b/llama.h @@ -89,6 +89,11 @@ extern "C" { int32_t n_gpu_layers; // number of layers to store in VRAM int32_t main_gpu; // the GPU that is used for scratch and small tensors float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs + + // ref: https://github.com/ggerganov/llama.cpp/pull/2054 + float rope_freq_base; // RoPE base frequency + float rope_freq_scale; // RoPE frequency scaling factor + // called with a progress value between 0 and 1, pass NULL to disable llama_progress_callback progress_callback; // context pointer passed to the progress callback @@ -270,10 +275,21 @@ extern "C" { int n_max_tokens, bool add_bos); + LLAMA_API int llama_tokenize_with_model( + const struct llama_model * model, + const char * text, + llama_token * tokens, + int n_max_tokens, + bool add_bos); + LLAMA_API int llama_n_vocab(const struct llama_context * ctx); LLAMA_API int llama_n_ctx (const struct llama_context * ctx); LLAMA_API int llama_n_embd (const struct llama_context * ctx); + LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model); + LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model); + LLAMA_API int llama_n_embd_from_model (const struct llama_model * model); + // Get the vocabulary as output parameters. // Returns number of results. LLAMA_API int llama_get_vocab( @@ -282,6 +298,12 @@ extern "C" { float * scores, int capacity); + LLAMA_API int llama_get_vocab_from_model( + const struct llama_model * model, + const char * * strings, + float * scores, + int capacity); + // Token logits obtained from the last call to llama_eval() // The logits for the last token are stored in the last row // Can be mutated in order to change the probabilities of the next token @@ -294,7 +316,13 @@ extern "C" { LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); // Token Id -> String. Uses the vocabulary in the provided context - LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); + LLAMA_API const char * llama_token_to_str( + const struct llama_context * ctx, + llama_token token); + + LLAMA_API const char * llama_token_to_str_with_model( + const struct llama_model * model, + llama_token token); // Special tokens LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence diff --git a/scripts/verify-checksum-models.py b/scripts/verify-checksum-models.py old mode 100644 new mode 100755 index d127482819f46..307b7c08d86da --- a/scripts/verify-checksum-models.py +++ b/scripts/verify-checksum-models.py @@ -1,3 +1,5 @@ +#!/bin/env python3 + import os import hashlib