From e522bad948f0816eaba3292ba36bc252fbc5897a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Mar 2025 23:24:27 +0000 Subject: [PATCH 1/9] chore(deps): bump pypa/cibuildwheel from 2.22.0 to 2.23.2 Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.22.0 to 2.23.2. - [Release notes](https://github.com/pypa/cibuildwheel/releases) - [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md) - [Commits](https://github.com/pypa/cibuildwheel/compare/v2.22.0...v2.23.2) --- updated-dependencies: - dependency-name: pypa/cibuildwheel dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/build-and-release.yaml | 4 ++-- .github/workflows/build-wheels-metal.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index 7307c85ab..9efe440cf 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -42,7 +42,7 @@ jobs: shell: cmd - name: Build wheels - uses: pypa/cibuildwheel@v2.22.0 + uses: pypa/cibuildwheel@v2.23.2 env: # disable repair CIBW_REPAIR_WHEEL_COMMAND: "" @@ -69,7 +69,7 @@ jobs: platforms: linux/arm64 - name: Build wheels - uses: pypa/cibuildwheel@v2.22.0 + uses: pypa/cibuildwheel@v2.23.2 env: CIBW_SKIP: "*musllinux* pp*" CIBW_REPAIR_WHEEL_COMMAND: "" diff --git a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml index 9b97bf2f5..5bc44f2ea 100644 --- a/.github/workflows/build-wheels-metal.yaml +++ b/.github/workflows/build-wheels-metal.yaml @@ -43,7 +43,7 @@ jobs: shell: cmd - name: Build wheels - uses: pypa/cibuildwheel@v2.22.0 + uses: pypa/cibuildwheel@v2.23.2 env: # disable repair CIBW_REPAIR_WHEEL_COMMAND: "" From f33dde30a1597b0e9d62bc7f35cb42a2e9910593 Mon Sep 17 00:00:00 2001 From: kossum <127719370+kossum@users.noreply.github.com> Date: Mon, 31 Mar 2025 04:15:39 +0900 Subject: [PATCH 2/9] feat: Add Gemma3 chat handler (#1976) --- llama_cpp/llama_chat_format.py | 89 ++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 17575c700..0d6d39cb8 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3373,6 +3373,95 @@ class MiniCPMv26ChatHandler(Llava15ChatHandler): ) +class Gemma3ChatHandler(Llava15ChatHandler): + # Chat Format: + # 'user\n{system_prompt}\n\n{prompt}\nmodel\n' + + DEFAULT_SYSTEM_MESSAGE = None + + CHAT_FORMAT = ( + "{{ '' }}" + "{%- if messages[0]['role'] == 'system' -%}" + "{%- if messages[0]['content'] is string -%}" + "{%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}" + "{%- else -%}" + "{%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}" + "{%- endif -%}" + "{%- set loop_messages = messages[1:] -%}" + "{%- else -%}" + "{%- set first_user_prefix = \"\" -%}" + "{%- set loop_messages = messages -%}" + "{%- endif -%}" + "{%- for message in loop_messages -%}" + "{%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}" + "{{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}" + "{%- endif -%}" + "{%- if (message['role'] == 'assistant') -%}" + "{%- set role = \"model\" -%}" + "{%- else -%}" + "{%- set role = message['role'] -%}" + "{%- endif -%}" + "{{ '' + role + '\n' + (first_user_prefix if loop.first 
else \"\") }}" + "{%- if message['content'] is string -%}" + "{{ message['content'] | trim }}" + "{%- elif message['content'] is iterable -%}" + "{%- for item in message['content'] -%}" + "{%- if item['type'] == 'image' -%}" + "{{ '' }}" + "{%- elif item['type'] == 'text' -%}" + "{{ item['text'] | trim }}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- else -%}" + "{{ raise_exception(\"Invalid content type\") }}" + "{%- endif -%}" + "{{ '\n' }}" + "{%- endfor -%}" + "{%- if add_generation_prompt -%}" + "{{ 'model\n' }}" + "{%- endif -%}" + ) + + @staticmethod + def split_text_on_image_urls(text: str, image_urls: List[str]): + split_text: List[Tuple[Literal["text", "image_url"], str]] = [] + copied_urls = image_urls[:] + remaining = text + image_placeholder = "" + + while remaining: + # Find placeholder + pos = remaining.find(image_placeholder) + if pos != -1: + assert len(copied_urls) > 0 + if pos > 0: + split_text += [("text", remaining[:pos])] + split_text += [("text", "\n\n")] + split_text += [("image_url", copied_urls.pop(0))] + split_text += [("text", "\n\n")] + remaining = remaining[pos + len(image_placeholder):] + else: + assert len(copied_urls) == 0 + split_text.append(("text", remaining)) + remaining = "" + return split_text + + @staticmethod + def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]): + image_urls: List[str] = [] + for message in messages: + if message["role"] == "user": + if message.get("content") is None: + continue + for content in message["content"]: + if isinstance(content, dict) and content.get("type") == "image": + if isinstance(content.get("image"), dict) and isinstance(content["image"].get("url"), str): + image_urls.append(content["image"]["url"]) + elif isinstance(content.get("url"), str): + image_urls.append(content["url"]) + return image_urls + + @register_chat_completion_handler("chatml-function-calling") def chatml_function_calling( llama: llama.Llama, From 25b2f8fe0d92cb27e364d3c9601dde77e50446bf Mon Sep 17 00:00:00 2001 From: kossum <127719370+kossum@users.noreply.github.com> Date: Thu, 3 Apr 2025 06:25:21 +0900 Subject: [PATCH 3/9] resolve the image embedding issue in gemma3 --- llama_cpp/llama_chat_format.py | 101 ++++++++++++++++++++++------- llama_cpp/llava_cpp.py | 112 +++++++++++++++++++++++++++++++++ 2 files changed, 191 insertions(+), 22 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 0d6d39cb8..7ac0f4016 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2835,24 +2835,7 @@ def __call__( ) llama.eval(tokens) else: - image_bytes = self.load_image(value) - embed = self._embed_image_bytes(image_bytes, llama.context_params.n_threads_batch) - if llama.n_tokens + embed.contents.n_image_pos > llama.n_ctx(): - raise ValueError( - f"Prompt exceeds n_ctx: {llama.n_tokens + embed.contents.n_image_pos} > {llama.n_ctx()}" - ) - n_past = ctypes.c_int(llama.n_tokens) - n_past_p = ctypes.pointer(n_past) - with suppress_stdout_stderr(disable=self.verbose): - self._llava_cpp.llava_eval_image_embed( - llama.ctx, - embed, - llama.n_batch, - n_past_p, - ) - # Required to avoid issues with hf tokenizer - llama.input_ids[llama.n_tokens : n_past.value] = -1 - llama.n_tokens = n_past.value + self.eval_image(llama, value) # Get prompt tokens to avoid a cache miss prompt = llama.input_ids[: llama.n_tokens].tolist() @@ -2938,6 +2921,26 @@ def __call__( ) return _convert_completion_to_chat(completion_or_chunks, stream=stream) + def eval_image(self, llama: llama.Llama, 
image_url: str): + image_bytes = self.load_image(image_url) + embed = self._embed_image_bytes(image_bytes, llama.context_params.n_threads_batch) + if llama.n_tokens + embed.contents.n_image_pos > llama.n_ctx(): + raise ValueError( + f"Prompt exceeds n_ctx: {llama.n_tokens + embed.contents.n_image_pos} > {llama.n_ctx()}" + ) + n_past = ctypes.c_int(llama.n_tokens) + n_past_p = ctypes.pointer(n_past) + with suppress_stdout_stderr(disable=self.verbose): + self._llava_cpp.llava_eval_image_embed( + llama.ctx, + embed, + llama.n_batch, + n_past_p, + ) + # Required to avoid issues with hf tokenizer + llama.input_ids[llama.n_tokens : n_past.value] = -1 + llama.n_tokens = n_past.value + @staticmethod def _load_image(image_url: str) -> bytes: # TODO: Add Pillow support for other image formats beyond (jpg, png) @@ -3435,10 +3438,10 @@ def split_text_on_image_urls(text: str, image_urls: List[str]): if pos != -1: assert len(copied_urls) > 0 if pos > 0: - split_text += [("text", remaining[:pos])] - split_text += [("text", "\n\n")] - split_text += [("image_url", copied_urls.pop(0))] - split_text += [("text", "\n\n")] + split_text.append(("text", remaining[:pos])) + split_text.append(("text", "\n\n")) + split_text.append(("image_url", copied_urls.pop(0))) + split_text.append(("text", "\n\n")) remaining = remaining[pos + len(image_placeholder):] else: assert len(copied_urls) == 0 @@ -3461,6 +3464,60 @@ def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]): image_urls.append(content["url"]) return image_urls + def eval_image(self, llama: llama.Llama, image_url: str): + import llama_cpp + + img_bytes = self.load_image(image_url) + img_u8_p = self._llava_cpp.clip_image_u8_init() + if not self._llava_cpp.clip_image_load_from_bytes( + ctypes.create_string_buffer(img_bytes, len(img_bytes)), + ctypes.c_size_t(len(img_bytes)), + img_u8_p, + ): + self._llava_cpp.clip_image_u8_free(img_u8_p) + raise ValueError("Failed to load image.") + + img_f32 = self._llava_cpp.clip_image_f32_batch() + img_f32_p = ctypes.byref(img_f32) + if not self._llava_cpp.clip_image_preprocess(self.clip_ctx, img_u8_p, img_f32_p): + self._llava_cpp.clip_image_f32_batch_free(img_f32_p) + self._llava_cpp.clip_image_u8_free(img_u8_p) + raise ValueError("Failed to preprocess image.") + + n_embd = llama_cpp.llama_model_n_embd(llama._model.model) + n_tokens = 256 + embed = (ctypes.c_float * (n_tokens * n_embd))() + if not self._llava_cpp.clip_image_batch_encode(self.clip_ctx, llama.n_threads, img_f32_p, embed): + self._llava_cpp.clip_image_f32_batch_free(img_f32_p) + self._llava_cpp.clip_image_u8_free(img_u8_p) + raise ValueError("Failed to encode image.") + + self._llava_cpp.clip_image_f32_batch_free(img_f32_p) + self._llava_cpp.clip_image_u8_free(img_u8_p) + llama_cpp.llama_set_causal_attn(llama.ctx, False) + + seq_id_0 = (ctypes.c_int32 * 1)() + seq_ids = (ctypes.POINTER(ctypes.c_int32) * (n_tokens + 1))() + for i in range(n_tokens): + seq_ids[i] = seq_id_0 + + batch = llama_cpp.llama_batch() + batch.n_tokens = n_tokens + batch.token = None + batch.embd = embed + batch.pos = (ctypes.c_int32 * n_tokens)(*[i + llama.n_tokens for i in range(n_tokens)]) + batch.seq_id = seq_ids + batch.n_seq_id = (ctypes.c_int32 * n_tokens)(*([1] * n_tokens)) + batch.logits = (ctypes.c_int8 * n_tokens)() + + if llama_cpp.llama_decode(llama.ctx, batch): + raise ValueError("Failed to decode image.") + + llama_cpp.llama_set_causal_attn(llama.ctx, True) + # Required to avoid issues with hf tokenizer + llama.input_ids[llama.n_tokens : 
llama.n_tokens + n_tokens] = -1 + llama.n_tokens += n_tokens + @register_chat_completion_handler("chatml-function-calling") def chatml_function_calling( diff --git a/llama_cpp/llava_cpp.py b/llama_cpp/llava_cpp.py index d9dfaf5fd..46ac5087f 100644 --- a/llama_cpp/llava_cpp.py +++ b/llama_cpp/llava_cpp.py @@ -7,6 +7,7 @@ c_int, c_uint8, c_float, + c_size_t, c_void_p, POINTER, _Pointer, # type: ignore @@ -141,6 +142,28 @@ def llava_eval_image_embed( ################################################ +# struct clip_image_u8_batch { +# struct clip_image_u8 * data; +# size_t size; +# }; +class clip_image_u8_batch(Structure): + _fields_ = [ + ("data", c_void_p), + ("size", c_size_t), + ] + + +# struct clip_image_f32_batch { +# struct clip_image_f32 * data; +# size_t size; +# }; +class clip_image_f32_batch(Structure): + _fields_ = [ + ("data", c_void_p), + ("size", c_size_t), + ] + + # /** load mmproj model */ # CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity); @ctypes_function("clip_model_load", [c_char_p, c_int], clip_ctx_p_ctypes) @@ -156,3 +179,92 @@ def clip_model_load( def clip_free(ctx: clip_ctx_p, /): ... + +# CLIP_API struct clip_image_u8 * clip_image_u8_init (); +@ctypes_function("clip_image_u8_init", [], c_void_p) +def clip_image_u8_init() -> Optional[c_void_p]: + ... + + +# CLIP_API void clip_image_u8_free (struct clip_image_u8 * img); +@ctypes_function("clip_image_u8_free", [c_void_p], None) +def clip_image_u8_free(img: c_void_p, /): + ... + + +# CLIP_API void clip_image_f32_free(struct clip_image_f32 * img); +@ctypes_function("clip_image_f32_free", [c_void_p], None) +def clip_image_f32_free(img: c_void_p, /): + ... + + +# CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch); +@ctypes_function("clip_image_u8_batch_free", [POINTER(clip_image_u8_batch)], None) +def clip_image_u8_batch_free(batch: "_Pointer[clip_image_u8_batch]", /): + ... + + +# CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch); +@ctypes_function("clip_image_f32_batch_free", [POINTER(clip_image_f32_batch)], None) +def clip_image_f32_batch_free(batch: "_Pointer[clip_image_f32_batch]", /): + ... + + +# /** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */ +# CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs ); +@ctypes_function( + "clip_image_preprocess", + [ + clip_ctx_p_ctypes, + c_void_p, + POINTER(clip_image_f32_batch), + ], + c_bool, +) +def clip_image_preprocess( + ctx: clip_ctx_p, + img: c_void_p, + res_imgs: "_Pointer[clip_image_f32_batch]", + /, +) -> bool: + ... + + +# CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec); +@ctypes_function( + "clip_image_batch_encode", + [ + clip_ctx_p_ctypes, + c_int, + POINTER(clip_image_f32_batch), + POINTER(c_float), + ], + c_bool, +) +def clip_image_batch_encode( + ctx: clip_ctx_p, + n_threads: c_int, + imgs: "_Pointer[clip_image_f32_batch]", + vec: c_void_p +) -> bool: + ... 
+ + +# /** interpret bytes as an image file with length bytes_length, and use the result to populate img */ +# CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); +@ctypes_function( + "clip_image_load_from_bytes", + [ + c_void_p, + c_size_t, + c_void_p, + ], + c_bool, +) +def clip_image_load_from_bytes( + bytes: c_void_p, + bytes_length: c_size_t, + img: c_void_p, + /, +) -> bool: + ... From 1b455888d40aa2f64ace593ddeb7c54a3087d631 Mon Sep 17 00:00:00 2001 From: kossum <127719370+kossum@users.noreply.github.com> Date: Thu, 3 Apr 2025 19:43:58 +0900 Subject: [PATCH 4/9] fix: added n_ctx check for prompt requirements when embedding images in Gemma3ChatHandler --- llama_cpp/llama_chat_format.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 7ac0f4016..cbac975bd 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3467,6 +3467,12 @@ def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]): def eval_image(self, llama: llama.Llama, image_url: str): import llama_cpp + n_tokens = 256 + if llama.n_tokens + n_tokens > llama.n_ctx(): + raise ValueError( + f"Prompt exceeds n_ctx: {llama.n_tokens + n_tokens} > {llama.n_ctx()}" + ) + img_bytes = self.load_image(image_url) img_u8_p = self._llava_cpp.clip_image_u8_init() if not self._llava_cpp.clip_image_load_from_bytes( @@ -3485,7 +3491,6 @@ def eval_image(self, llama: llama.Llama, image_url: str): raise ValueError("Failed to preprocess image.") n_embd = llama_cpp.llama_model_n_embd(llama._model.model) - n_tokens = 256 embed = (ctypes.c_float * (n_tokens * n_embd))() if not self._llava_cpp.clip_image_batch_encode(self.clip_ctx, llama.n_threads, img_f32_p, embed): self._llava_cpp.clip_image_f32_batch_free(img_f32_p) From 360b04c4e69f9dfea29cdc30a1bbe7bf88ce84ce Mon Sep 17 00:00:00 2001 From: marme Date: Thu, 3 Apr 2025 16:07:58 +0200 Subject: [PATCH 5/9] update to match llama.cpp e0e912f api --- llama_cpp/llama_cpp.py | 125 ++++++++++++++++++++++++++++++----------- 1 file changed, 92 insertions(+), 33 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index f3985ad2f..170020654 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -165,6 +165,10 @@ # llama_sampler_p = NewType("llama_sampler_p", int) # llama_sampler_p_ctypes = ctypes.c_void_p +# struct llama_kv_cache; +llama_kv_cache_p = NewType("llama_kv_cache_p", int) +llama_kv_cache_p_ctypes = ctypes.c_void_p + # typedef int32_t llama_pos; llama_pos = ctypes.c_int32 # typedef int32_t llama_token; @@ -259,7 +263,9 @@ LLAMA_VOCAB_PRE_TYPE_MINERVA = 27 LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28 LLAMA_VOCAB_PRE_TYPE_GPT4O = 29 - +LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30 +LLAMA_VOCAB_PRE_TYPE_TRILLION = 31 +LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32 # // note: these values should be synchronized with ggml_rope # // TODO: maybe move this enum to ggml.h (ggml_rope_type) @@ -630,10 +636,29 @@ class llama_model_kv_override(ctypes.Structure): value: Union[int, float, bool, bytes] + +# struct llama_model_tensor_buft_override { +# const char * pattern; +# ggml_backend_buffer_type_t buft; +# +# }; +class llama_model_tensor_buft_override(ctypes.Structure): + _fields_ = [ + ("pattern", ctypes.c_char_p), + ("buft", ctypes.c_void_p) + ] + + +llama_model_tensor_buft_override_p = ctypes.POINTER(llama_model_tensor_buft_override) + + # struct llama_model_params { # // NULL-terminated list of 
devices to use for offloading (if NULL, all available devices are used) # ggml_backend_dev_t * devices; +# // NULL-terminated list of buffer types to use for tensors that match a pattern +# const struct llama_model_tensor_buft_override * tensor_buft_overrides; + # int32_t n_gpu_layers; // number of layers to store in VRAM # enum llama_split_mode split_mode; // how to split the model across multiple GPUs @@ -695,6 +720,7 @@ class llama_model_params(ctypes.Structure): _fields_ = [ ("devices", ctypes.c_void_p), # NOTE: unnused + ("llama_model_tensor_buft_override", llama_model_tensor_buft_override_p), ("n_gpu_layers", ctypes.c_int32), ("split_mode", ctypes.c_int), ("main_gpu", ctypes.c_int32), @@ -1316,6 +1342,10 @@ def llama_n_vocab(model: llama_vocab_p, /) -> int: def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]: ... +# LLAMA_API struct llama_kv_cache * llama_get_kv_self ( struct llama_context * ctx); +@ctypes_function("llama_get_kv_self", [llama_context_p_ctypes], llama_model_p_ctypes) +def llama_get_kv_self(ctx: llama_context_p, /) -> Optional[llama_kv_cache_p]: + ... # LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); @ctypes_function("llama_pooling_type", [llama_context_p_ctypes], ctypes.c_int) @@ -1810,7 +1840,19 @@ def llama_kv_cache_view_update(ctx: llama_context_p, view: CtypesPointerOrRef[ll # // Returns the number of tokens in the KV cache (slow, use only for debug) # // If a KV cell has multiple sequences assigned to it, it will be counted multiple times -# LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx); +# LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx); +@ctypes_function( + "llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32 +) +def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int: + """Returns the number of tokens in the KV cache (slow, use only for debug) + If a KV cell has multiple sequences assigned to it, it will be counted multiple times + """ + ... + +# // Returns the number of tokens in the KV cache (slow, use only for debug) +# // If a KV cell has multiple sequences assigned to it, it will be counted multiple times +# DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx), "use llama_kv_self_n_tokens instead"); @ctypes_function( "llama_get_kv_cache_token_count", [llama_context_p_ctypes], ctypes.c_int32 ) @@ -1832,10 +1874,10 @@ def llama_get_kv_cache_used_cells(ctx: llama_context_p, /) -> int: # // Clear the KV cache - both cell info is erased and KV data is zeroed -# LLAMA_API void llama_kv_cache_clear( +# LLAMA_API void llama_kv_self_clear( # struct llama_context * ctx); -@ctypes_function("llama_kv_cache_clear", [llama_context_p_ctypes], None) -def llama_kv_cache_clear(ctx: llama_context_p, /): +@ctypes_function("llama_kv_self_clear", [llama_context_p_ctypes], None) +def llama_kv_self_clear(ctx: llama_context_p, /): """Clear the KV cache""" ... 
@@ -1845,13 +1887,13 @@ def llama_kv_cache_clear(ctx: llama_context_p, /): # // seq_id < 0 : match any sequence # // p0 < 0 : [0, p1] # // p1 < 0 : [p0, inf) -# LLAMA_API bool llama_kv_cache_seq_rm( +# LLAMA_API bool llama_kv_self_seq_rm( # struct llama_context * ctx, # llama_seq_id seq_id, # llama_pos p0, # llama_pos p1); @ctypes_function( - "llama_kv_cache_seq_rm", + "llama_kv_self_seq_rm", [ llama_context_p_ctypes, llama_seq_id, @@ -1860,7 +1902,7 @@ def llama_kv_cache_clear(ctx: llama_context_p, /): ], ctypes.c_bool, ) -def llama_kv_cache_seq_rm( +def llama_kv_self_seq_rm( ctx: llama_context_p, seq_id: Union[llama_seq_id, int], p0: Union[llama_pos, int], @@ -1881,14 +1923,14 @@ def llama_kv_cache_seq_rm( # // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence # // p0 < 0 : [0, p1] # // p1 < 0 : [p0, inf) -# LLAMA_API void llama_kv_cache_seq_cp( +# LLAMA_API void llama_kv_self_seq_cp( # struct llama_context * ctx, # llama_seq_id seq_id_src, # llama_seq_id seq_id_dst, # llama_pos p0, # llama_pos p1); @ctypes_function( - "llama_kv_cache_seq_cp", + "llama_kv_self_seq_cp", [ llama_context_p_ctypes, llama_seq_id, @@ -1898,7 +1940,7 @@ def llama_kv_cache_seq_rm( ], None, ) -def llama_kv_cache_seq_cp( +def llama_kv_self_seq_cp( ctx: llama_context_p, seq_id_src: Union[llama_seq_id, int], seq_id_dst: Union[llama_seq_id, int], @@ -1914,13 +1956,13 @@ def llama_kv_cache_seq_cp( # // Removes all tokens that do not belong to the specified sequence -# LLAMA_API void llama_kv_cache_seq_keep( +# LLAMA_API void llama_kv_self_seq_keep( # struct llama_context * ctx, # llama_seq_id seq_id); @ctypes_function( - "llama_kv_cache_seq_keep", [llama_context_p_ctypes, llama_seq_id], None + "llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None ) -def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /): +def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /): """Removes all tokens that do not belong to the specified sequence""" ... @@ -1928,17 +1970,17 @@ def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, in # // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) # // If the KV cache is RoPEd, the KV data is updated accordingly: # // - lazily on next llama_decode() -# // - explicitly with llama_kv_cache_update() +# // - explicitly with llama_kv_self_update() # // p0 < 0 : [0, p1] # // p1 < 0 : [p0, inf) -# LLAMA_API void llama_kv_cache_seq_add( +# LLAMA_API void llama_kv_self_seq_add( # struct llama_context * ctx, # llama_seq_id seq_id, # llama_pos p0, # llama_pos p1, # llama_pos delta); @ctypes_function( - "llama_kv_cache_seq_add", + "llama_kv_self_seq_add", [ llama_context_p_ctypes, llama_seq_id, @@ -1948,7 +1990,7 @@ def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, in ], None, ) -def llama_kv_cache_seq_add( +def llama_kv_self_seq_add( ctx: llama_context_p, seq_id: Union[llama_seq_id, int], p0: Union[llama_pos, int], @@ -1959,7 +2001,7 @@ def llama_kv_cache_seq_add( """Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) If the KV cache is RoPEd, the KV data is updated accordingly: - lazily on next llama_decode() - - explicitly with llama_kv_cache_update() + - explicitly with llama_kv_self_update() p0 < 0 : [0, p1] p1 < 0 : [p0, inf)""" ... 
@@ -1969,14 +2011,14 @@ def llama_kv_cache_seq_add( # // If the KV cache is RoPEd, the KV data is updated accordingly # // p0 < 0 : [0, p1] # // p1 < 0 : [p0, inf) -# LLAMA_API void llama_kv_cache_seq_div( +# LLAMA_API void llama_kv_self_seq_div( # struct llama_context * ctx, # llama_seq_id seq_id, # llama_pos p0, # llama_pos p1, # int d); @ctypes_function( - "llama_kv_cache_seq_div", + "llama_kv_self_seq_div", [ llama_context_p_ctypes, llama_seq_id, @@ -1986,7 +2028,7 @@ def llama_kv_cache_seq_add( ], None, ) -def llama_kv_cache_seq_div( +def llama_kv_self_seq_div( ctx: llama_context_p, seq_id: Union[llama_seq_id, int], p0: Union[llama_pos, int], @@ -2004,29 +2046,29 @@ def llama_kv_cache_seq_div( # // Defragment the KV cache # // This will be applied: # // - lazily on next llama_decode() -# // - explicitly with llama_kv_cache_update() -# LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx); -@ctypes_function("llama_kv_cache_defrag", [llama_context_p_ctypes], None) -def llama_kv_cache_defrag(ctx: llama_context_p, /): +# // - explicitly with llama_kv_self_update() +# LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx); +@ctypes_function("llama_kv_self_defrag", [llama_context_p_ctypes], None) +def llama_kv_self_defrag(ctx: llama_context_p, /): """Defragment the KV cache This will be applied: - lazily on next llama_decode() - - explicitly with llama_kv_cache_update()""" + - explicitly with llama_kv_self_update()""" ... # // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) -# LLAMA_API void llama_kv_cache_update(struct llama_context * ctx); -@ctypes_function("llama_kv_cache_update", [llama_context_p_ctypes], None) -def llama_kv_cache_update(ctx: llama_context_p, /): +# LLAMA_API void llama_kv_self_update(struct llama_context * ctx); +@ctypes_function("llama_kv_self_update", [llama_context_p_ctypes], None) +def llama_kv_self_update(ctx: llama_context_p, /): """Apply the KV cache updates (such as K-shifts, defragmentation, etc.)""" ... # // Check if the context supports KV cache shifting -# LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx); -@ctypes_function("llama_kv_cache_can_shift", [llama_context_p_ctypes], ctypes.c_bool) -def llama_kv_cache_can_shift(ctx: llama_context_p, /) -> bool: +# LLAMA_API bool llama_kv_self_can_shift(struct llama_context * ctx); +@ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool) +def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool: """Check if the context supports KV cache shifting""" ... @@ -2547,6 +2589,16 @@ def llama_set_causal_attn(ctx: llama_context_p, causal_attn: bool, /): ... +# // Set whether the model is in warmup mode or not +# // If true, all model tensors are activated during llama_decode() to load and cache their weights. +# LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup); +@ctypes_function("llama_set_warmup", [llama_context_p_ctypes, ctypes.c_bool], None) +def llama_set_warmup(ctx: llama_context_p, warmup: bool, /): + """Set whether to use causal attention or not + If set to true, the model will only attend to the past tokens""" + ... + + # // Set abort callback # LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data); @ctypes_function( @@ -3701,6 +3753,13 @@ def llama_sampler_init_mirostat_v2( ... + + + +# /// @details Intializes a GBNF grammar, see grammars/README.md for details. 
+# /// @param vocab The vocabulary that this grammar will be used with. +# /// @param grammar_str The production rules for the grammar, encoded as a string. Returns an empty grammar if empty. Returns NULL if parsing of grammar_str fails. +# /// @param grammar_root The name of the start symbol for the grammar. # LLAMA_API struct llama_sampler * llama_sampler_init_grammar( # const struct llama_vocab * vocab, # const char * grammar_str, From 924833237f881339124bd47425b4049af01298f7 Mon Sep 17 00:00:00 2001 From: marme Date: Thu, 3 Apr 2025 17:01:27 +0200 Subject: [PATCH 6/9] update _internals and _utils to match API changes, add deprecation warnings --- llama_cpp/_internals.py | 29 +++++++++++++++++++++++++---- llama_cpp/_utils.py | 16 ++++++++++++++++ 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 343581dce..57a9a5ab4 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -13,6 +13,11 @@ from dataclasses import dataclass, field from contextlib import ExitStack +try: + from warnings import deprecated +except ImportError: + from ._utils import deprecated + import numpy as np import numpy.typing as npt @@ -276,21 +281,37 @@ def n_ctx(self) -> int: def pooling_type(self) -> int: return llama_cpp.llama_pooling_type(self.ctx) + @deprecated("Use llama_kv_self_clear") def kv_cache_clear(self): - llama_cpp.llama_kv_cache_clear(self.ctx) + self.llama_kv_self_clear() + @deprecated("Use kv_self_seq_rm") def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int): - llama_cpp.llama_kv_cache_seq_rm(self.ctx, seq_id, p0, p1) + self.kv_self_seq_rm(seq_id, p0, p1) + @deprecated("Use kv_self_seq_cp") def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int): - llama_cpp.llama_kv_cache_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1) + self.kv_self_seq_cp(seq_id_src, seq_id_dst, p0, p1) + @deprecated("Use kv_self_seq_keep") def kv_cache_seq_keep(self, seq_id: int): - llama_cpp.llama_kv_cache_seq_keep(self.ctx, seq_id) + self.kv_self_seq_keep(seq_id) def kv_cache_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int): llama_cpp.llama_kv_cache_seq_add(self.ctx, seq_id, p0, p1, shift) + def llama_kv_self_clear(self): + llama_cpp.llama_llama_kv_self_clear(self.ctx) + + def kv_self_seq_rm(self, seq_id: int, p0: int, p1: int): + llama_cpp.llama_kv_self_seq_rm(self.ctx, seq_id, p0, p1) + + def kv_self_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int): + llama_cpp.llama_kv_self_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1) + + def kv_self_seq_keep(self, seq_id: int): + llama_cpp.llama_kv_self_seq_keep(self.ctx, seq_id) + def get_state_size(self) -> int: return llama_cpp.llama_get_state_size(self.ctx) diff --git a/llama_cpp/_utils.py b/llama_cpp/_utils.py index 29628193b..75b39f694 100644 --- a/llama_cpp/_utils.py +++ b/llama_cpp/_utils.py @@ -1,5 +1,7 @@ import os import sys +import warnings +import functools from typing import Any, Dict @@ -76,3 +78,17 @@ class Singleton(object, metaclass=MetaSingleton): def __init__(self): super(Singleton, self).__init__() + + +def deprecated(reason): + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + warnings.warn( + f"Call to deprecated function {func.__name__} ({reason}).", + category=DeprecationWarning, + stacklevel=2, + ) + return func(*args, **kwargs) + return wrapper + return decorator From b6e3c89826c4138283496ed36ddf2c32ec813d19 Mon Sep 17 00:00:00 2001 From: marme Date: Thu, 3 Apr 2025 17:18:18 +0200 Subject: [PATCH 
7/9] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 7841fc723..2004644b7 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 7841fc723e059d1fd9640e5c0ef19050fcc7c698 +Subproject commit 2004644b7a5da6fe080e51861ab583480280f1d3 From 025e7fa44bfd071eb36b5641448c4e80a0b29917 Mon Sep 17 00:00:00 2001 From: kossum <127719370+kossum@users.noreply.github.com> Date: Fri, 4 Apr 2025 20:17:26 +0900 Subject: [PATCH 8/9] fix: modify the gemma3 chat template to be compatible with openai api --- llama_cpp/llama_chat_format.py | 17 +---------------- llama_cpp/llava_cpp.py | 3 ++- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index cbac975bd..4e1aad381 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3409,7 +3409,7 @@ class Gemma3ChatHandler(Llava15ChatHandler): "{{ message['content'] | trim }}" "{%- elif message['content'] is iterable -%}" "{%- for item in message['content'] -%}" - "{%- if item['type'] == 'image' -%}" + "{%- if item['type'] == 'image_url' -%}" "{{ '' }}" "{%- elif item['type'] == 'text' -%}" "{{ item['text'] | trim }}" @@ -3449,21 +3449,6 @@ def split_text_on_image_urls(text: str, image_urls: List[str]): remaining = "" return split_text - @staticmethod - def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]): - image_urls: List[str] = [] - for message in messages: - if message["role"] == "user": - if message.get("content") is None: - continue - for content in message["content"]: - if isinstance(content, dict) and content.get("type") == "image": - if isinstance(content.get("image"), dict) and isinstance(content["image"].get("url"), str): - image_urls.append(content["image"]["url"]) - elif isinstance(content.get("url"), str): - image_urls.append(content["url"]) - return image_urls - def eval_image(self, llama: llama.Llama, image_url: str): import llama_cpp diff --git a/llama_cpp/llava_cpp.py b/llama_cpp/llava_cpp.py index 46ac5087f..8a382b4d9 100644 --- a/llama_cpp/llava_cpp.py +++ b/llama_cpp/llava_cpp.py @@ -245,7 +245,8 @@ def clip_image_batch_encode( ctx: clip_ctx_p, n_threads: c_int, imgs: "_Pointer[clip_image_f32_batch]", - vec: c_void_p + vec: c_void_p, + /, ) -> bool: ... From 8f21c9000425b4ebf437c6c95bae5e1c53d53173 Mon Sep 17 00:00:00 2001 From: bot08 <71845954+bot08@users.noreply.github.com> Date: Mon, 7 Apr 2025 22:43:17 +0300 Subject: [PATCH 9/9] Update README.md --- README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/README.md b/README.md index e00456580..3e5c22617 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,20 @@ +# Example installing from this fork with CUDA support (PowerShell) + +```powershell +$env:CMAKE_ARGS = "-DGGML_CUDA=on" +pip install git+https://github.com/bot08/llama-cpp-python.git@main +``` + +To force a clean rebuild: + +```powershell +$env:CMAKE_ARGS = "-DGGML_CUDA=on" +pip install --upgrade --force-reinstall --no-cache-dir ` + git+https://github.com/bot08/llama-cpp-python.git@main +``` + +--- +
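
Note: the patches above add a `Gemma3ChatHandler` for multimodal chat (PATCH 2/9, 3/9, 4/9, 8/9), while the README snippet in PATCH 9/9 only covers installation. Below is a minimal usage sketch, not part of the patch series itself, assuming a locally downloaded Gemma 3 GGUF model and its mmproj (CLIP) file; the file paths and the example image URL are placeholders, and the constructor argument mirrors the `Llava15ChatHandler` API that `Gemma3ChatHandler` inherits from.

```python
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Gemma3ChatHandler

# Placeholder paths: point these at a local Gemma 3 GGUF model and its mmproj file.
chat_handler = Gemma3ChatHandler(clip_model_path="models/gemma-3-mmproj.gguf")

llm = Llama(
    model_path="models/gemma-3-4b-it.gguf",
    chat_handler=chat_handler,
    n_ctx=4096,        # leave room for the 256 embedding positions used per image (see PATCH 4/9)
    n_gpu_layers=-1,   # offload all layers; assumes the CUDA build installed above
)

response = llm.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image in one sentence."},
                # PATCH 8/9 switches the template to the OpenAI-style "image_url" content type.
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
            ],
        }
    ],
)
print(response["choices"][0]["message"]["content"])
```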