Merge 'origin/master' into hipblas
SlyEcho committed May 27, 2023
2 parents 4c8b3fb + 0ecb1bb commit a4648c1
Showing 6 changed files with 79 additions and 19 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/build.yml
@@ -165,7 +165,7 @@ jobs:
- build: 'clblast'
defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
- build: 'openblas'
defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include"'
defines: '-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'

steps:
- name: Clone
@@ -187,7 +187,7 @@ jobs:
curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
rename-item $env:RUNNER_TEMP/clblast_release_dir clblast
rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
$txt = Get-Content -Path $f -Raw
$txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
@@ -213,7 +213,6 @@ jobs:
cd build
cmake .. ${{ matrix.defines }}
cmake --build . --config Release
cp ../LICENSE ./bin/Release/llama.cpp.txt
- name: Add clblast.dll
id: add_clblast_dll
@@ -258,6 +257,7 @@ jobs:
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
- name: Upload artifacts
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -66,7 +66,7 @@ endif()
# 3rd party libs
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
option(LLAMA_BLAS_VENDOR "llama: BLA_VENDOR from https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" Generic)
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
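Switching LLAMA_BLAS_VENDOR from option() to a cached STRING makes the vendor a free-form value that can be overridden at configure time. An illustrative configure line, using the same vendor the CI workflow above passes:

cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS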
2 changes: 1 addition & 1 deletion examples/main/README.md
@@ -272,7 +272,7 @@ These options help improve the performance and memory usage of the LLaMA models.

### Prompt Caching

- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs.
- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.
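An illustrative pair of runs (model path and prompt are placeholders): the first invocation creates the cache file, the second reuses it. Per the note above, even the fixed seed in the second run does not guarantee an identical continuation.

./main -m models/7B/ggml-model.bin -p "Once upon a time" --prompt-cache prompt.cache
./main -m models/7B/ggml-model.bin -p "Once upon a time" --prompt-cache prompt.cache --seed 1234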

### Quantization

18 changes: 14 additions & 4 deletions examples/main/main.cpp
@@ -134,8 +134,6 @@ int main(int argc, char ** argv) {
return 0;
}

// Add a space in front of the first character to match OG llama tokenizer behavior
params.prompt.insert(0, 1, ' ');

std::string path_session = params.path_prompt_cache;
std::vector<llama_token> session_tokens;
@@ -155,6 +153,7 @@ int main(int argc, char ** argv) {
return 1;
}
session_tokens.resize(n_token_count_out);
llama_set_rng_seed(ctx, params.seed);

fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
} else {
@@ -163,7 +162,16 @@ int main(int argc, char ** argv) {
}

// tokenize the prompt
auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
std::vector<llama_token> embd_inp;

if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
// Add a space in front of the first character to match OG llama tokenizer behavior
params.prompt.insert(0, 1, ' ');

embd_inp = ::llama_tokenize(ctx, params.prompt, true);
} else {
embd_inp = session_tokens;
}

const int n_ctx = llama_n_ctx(ctx);

@@ -181,7 +189,9 @@ int main(int argc, char ** argv) {
}
n_matching_session_tokens++;
}
if (n_matching_session_tokens >= embd_inp.size()) {
if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
fprintf(stderr, "%s: using full prompt from session file\n", __func__);
} else if (n_matching_session_tokens >= embd_inp.size()) {
fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
} else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
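A usage sketch of the new code path (file names are illustrative): with a non-empty session file and no -p, embd_inp is taken directly from session_tokens, main reports "using full prompt from session file", and llama_set_rng_seed(ctx, params.seed) reseeds generation for the restored session.

./main -m model.bin -p "My prompt" --prompt-cache session.bin      # first run: builds the session file
./main -m model.bin --prompt-cache session.bin --seed 42           # later run: reuses the full cached prompt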
58 changes: 49 additions & 9 deletions ggml.c
@@ -3494,7 +3494,7 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
};
static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");

static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"NONE",

"DUP",
@@ -3749,6 +3749,9 @@ const char * ggml_type_name(enum ggml_type type) {
return GGML_TYPE_NAME[type];
}

const char * ggml_op_name(enum ggml_op op) {
return GGML_OP_NAME[op];
}

size_t ggml_element_size(const struct ggml_tensor * tensor) {
return GGML_TYPE_SIZE[tensor->type];
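A minimal sketch of the new ggml_op_name() accessor (buffer size and tensor shapes are arbitrary); it returns the same strings that ggml_graph_print() uses:

#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * c = ggml_add(ctx, a, b);

    printf("%s\n", ggml_op_name(c->op)); // prints "ADD"

    ggml_free(ctx);
    return 0;
}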
@@ -3805,6 +3808,10 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
return wtype;
}

size_t ggml_tensor_overhead(void) {
return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
}

static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
return tensor->nb[0] > tensor->nb[1];
}
@@ -4017,6 +4024,10 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
return result;
}

void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
ctx->no_alloc = no_alloc;
}

// IMPORTANT:
// when creating "opt" tensors, always save and load the scratch buffer
// this is an error prone process, but it is necessary to support inplace
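A minimal sketch of how the two new helpers are meant to combine (sizes illustrative): with no_alloc enabled, tensors created in the context carry metadata only and their data stays NULL for the caller to point at an external buffer, so ggml_tensor_overhead() per tensor is enough to size the pool:

#include "ggml.h"

// Build a metadata-only context able to hold up to n_tensors tensors.
static struct ggml_context * new_meta_ctx(size_t n_tensors) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ n_tensors * ggml_tensor_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,   // tensors get no data allocation from the pool
    };
    return ggml_init(params);
}

// The flag can also be flipped on an existing context:
//   ggml_set_no_alloc(ctx, true);   // subsequent tensors have data == NULL
//   ggml_set_no_alloc(ctx, false);  // back to allocating data from the pool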
@@ -4061,7 +4072,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);

if (ctx->scratch.data == NULL || data != NULL) {
size_needed += sizeof(struct ggml_tensor);
size_needed += GGML_TENSOR_SIZE;

if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
@@ -4077,14 +4088,15 @@ struct ggml_tensor * ggml_new_tensor_impl(
};
} else {
if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
GGML_PRINT("%s: not enough space in the scratch memory\n", __func__);
GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
__func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
assert(false);
return NULL;
}

if (cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE > ctx->mem_size) {
if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
__func__, cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE, ctx->mem_size);
__func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
assert(false);
return NULL;
}
@@ -4093,7 +4105,7 @@ struct ggml_tensor * ggml_new_tensor_impl(

*obj_new = (struct ggml_object) {
.offs = cur_end + GGML_OBJECT_SIZE,
.size = sizeof(struct ggml_tensor),
.size = GGML_TENSOR_SIZE,
.next = NULL,
};

@@ -13792,11 +13804,19 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
// reached a leaf node, not part of the gradient graph (e.g. a constant)
GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);

if (strlen(node->name) == 0) {
snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs);
}

cgraph->leafs[cgraph->n_leafs] = node;
cgraph->n_leafs++;
} else {
GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);

if (strlen(node->name) == 0) {
snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes);
}

cgraph->nodes[cgraph->n_nodes] = node;
cgraph->grads[cgraph->n_nodes] = node->grad;
cgraph->n_nodes++;
@@ -14510,6 +14530,26 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
}
}

struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name) {
for (int i = 0; i < cgraph->n_leafs; i++) {
struct ggml_tensor * leaf = cgraph->leafs[i];

if (strcmp(leaf->name, name) == 0) {
return leaf;
}
}

for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * node = cgraph->nodes[i];

if (strcmp(node->name, name) == 0) {
return node;
}
}

return NULL;
}

void ggml_graph_print(const struct ggml_cgraph * cgraph) {
int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0};
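A minimal sketch tying the graph changes together (shapes arbitrary): tensors picked up during graph construction now receive default "leaf_%d"/"node_%d" names unless named beforehand, and ggml_get_tensor_by_name() can retrieve them from the finished graph:

#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * c = ggml_add(ctx, a, b);

    struct ggml_cgraph gf = ggml_build_forward(c); // auto-names: "leaf_0", "leaf_1", "node_0"

    struct ggml_tensor * n0 = ggml_get_tensor_by_name(&gf, "node_0");
    printf("%s is op %s\n", n0->name, ggml_op_name(n0->op)); // "node_0 is op ADD"

    ggml_free(ctx);
    return 0;
}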

@@ -14527,7 +14567,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
i,
node->ne[0], node->ne[1], node->ne[2],
GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
GGML_OP_NAME[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
(double) node->perf_cycles / (double) ggml_cycles_per_ms(),
(double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
(double) node->perf_time_us / 1000.0,
@@ -14541,15 +14581,15 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
i,
node->ne[0], node->ne[1],
GGML_OP_LABEL[node->op]);
GGML_OP_NAME[node->op]);
}

for (int i = 0; i < GGML_OP_COUNT; i++) {
if (perf_total_per_op_us[i] == 0) {
continue;
}

GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_LABEL[i], (double) perf_total_per_op_us[i] / 1000.0);
GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_NAME[i], (double) perf_total_per_op_us[i] / 1000.0);
}

GGML_PRINT("========================================\n");
12 changes: 11 additions & 1 deletion ggml.h
@@ -198,6 +198,7 @@
#define GGML_MAX_PARAMS 256
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_OPT 4
#define GGML_MAX_NAME 32
#define GGML_DEFAULT_N_THREADS 4

#define GGML_ASSERT(x) \
@@ -372,11 +373,13 @@ extern "C" {

void * data;

char name[32];
char name[GGML_MAX_NAME];

char padding[16];
};

static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

// computation graph
struct ggml_cgraph {
int n_nodes;
@@ -429,6 +432,7 @@ extern "C" {
GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float

GGML_API const char * ggml_type_name(enum ggml_type type);
GGML_API const char * ggml_op_name (enum ggml_op op);

GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);

@@ -437,6 +441,9 @@ extern "C" {
// TODO: temporary until model loading of ggml examples is refactored
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);

// use this to compute the memory overhead of a tensor
GGML_API size_t ggml_tensor_overhead(void);

// main

GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
@@ -445,6 +452,7 @@ extern "C" {
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);

GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);

GGML_API struct ggml_tensor * ggml_new_tensor(
struct ggml_context * ctx,
@@ -970,6 +978,8 @@ extern "C" {
GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);

GGML_API struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name);

// print info and performance information for the graph
GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);

