From e522bad948f0816eaba3292ba36bc252fbc5897a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Mar 2025 23:24:27 +0000 Subject: [PATCH 1/9] chore(deps): bump pypa/cibuildwheel from 2.22.0 to 2.23.2 Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.22.0 to 2.23.2. - [Release notes](https://github.com/pypa/cibuildwheel/releases) - [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md) - [Commits](https://github.com/pypa/cibuildwheel/compare/v2.22.0...v2.23.2) --- updated-dependencies: - dependency-name: pypa/cibuildwheel dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/build-and-release.yaml | 4 ++-- .github/workflows/build-wheels-metal.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index 7307c85ab..9efe440cf 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -42,7 +42,7 @@ jobs: shell: cmd - name: Build wheels - uses: pypa/cibuildwheel@v2.22.0 + uses: pypa/cibuildwheel@v2.23.2 env: # disable repair CIBW_REPAIR_WHEEL_COMMAND: "" @@ -69,7 +69,7 @@ jobs: platforms: linux/arm64 - name: Build wheels - uses: pypa/cibuildwheel@v2.22.0 + uses: pypa/cibuildwheel@v2.23.2 env: CIBW_SKIP: "*musllinux* pp*" CIBW_REPAIR_WHEEL_COMMAND: "" diff --git a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml index 9b97bf2f5..5bc44f2ea 100644 --- a/.github/workflows/build-wheels-metal.yaml +++ b/.github/workflows/build-wheels-metal.yaml @@ -43,7 +43,7 @@ jobs: shell: cmd - name: Build wheels - uses: pypa/cibuildwheel@v2.22.0 + uses: pypa/cibuildwheel@v2.23.2 env: # disable repair CIBW_REPAIR_WHEEL_COMMAND: "" From f33dde30a1597b0e9d62bc7f35cb42a2e9910593 Mon Sep 17 00:00:00 2001 From: kossum <127719370+kossum@users.noreply.github.com> Date: Mon, 31 Mar 2025 04:15:39 +0900 Subject: [PATCH 2/9] feat: Add Gemma3 chat handler (#1976) --- llama_cpp/llama_chat_format.py | 89 ++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 17575c700..0d6d39cb8 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3373,6 +3373,95 @@ class MiniCPMv26ChatHandler(Llava15ChatHandler): ) +class Gemma3ChatHandler(Llava15ChatHandler): + # Chat Format: + # 'user\n{system_prompt}\n\n{prompt}\nmodel\n' + + DEFAULT_SYSTEM_MESSAGE = None + + CHAT_FORMAT = ( + "{{ '' }}" + "{%- if messages[0]['role'] == 'system' -%}" + "{%- if messages[0]['content'] is string -%}" + "{%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}" + "{%- else -%}" + "{%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}" + "{%- endif -%}" + "{%- set loop_messages = messages[1:] -%}" + "{%- else -%}" + "{%- set first_user_prefix = \"\" -%}" + "{%- set loop_messages = messages -%}" + "{%- endif -%}" + "{%- for message in loop_messages -%}" + "{%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}" + "{{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}" + "{%- endif -%}" + "{%- if (message['role'] == 'assistant') -%}" + "{%- set role = \"model\" -%}" + "{%- else -%}" + "{%- set role = message['role'] -%}" + "{%- endif -%}" + "{{ '' + role + '\n' + (first_user_prefix if loop.first 
else \"\") }}" + "{%- if message['content'] is string -%}" + "{{ message['content'] | trim }}" + "{%- elif message['content'] is iterable -%}" + "{%- for item in message['content'] -%}" + "{%- if item['type'] == 'image' -%}" + "{{ '' }}" + "{%- elif item['type'] == 'text' -%}" + "{{ item['text'] | trim }}" + "{%- endif -%}" + "{%- endfor -%}" + "{%- else -%}" + "{{ raise_exception(\"Invalid content type\") }}" + "{%- endif -%}" + "{{ '\n' }}" + "{%- endfor -%}" + "{%- if add_generation_prompt -%}" + "{{ 'model\n' }}" + "{%- endif -%}" + ) + + @staticmethod + def split_text_on_image_urls(text: str, image_urls: List[str]): + split_text: List[Tuple[Literal["text", "image_url"], str]] = [] + copied_urls = image_urls[:] + remaining = text + image_placeholder = "" + + while remaining: + # Find placeholder + pos = remaining.find(image_placeholder) + if pos != -1: + assert len(copied_urls) > 0 + if pos > 0: + split_text += [("text", remaining[:pos])] + split_text += [("text", "\n\n")] + split_text += [("image_url", copied_urls.pop(0))] + split_text += [("text", "\n\n")] + remaining = remaining[pos + len(image_placeholder):] + else: + assert len(copied_urls) == 0 + split_text.append(("text", remaining)) + remaining = "" + return split_text + + @staticmethod + def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]): + image_urls: List[str] = [] + for message in messages: + if message["role"] == "user": + if message.get("content") is None: + continue + for content in message["content"]: + if isinstance(content, dict) and content.get("type") == "image": + if isinstance(content.get("image"), dict) and isinstance(content["image"].get("url"), str): + image_urls.append(content["image"]["url"]) + elif isinstance(content.get("url"), str): + image_urls.append(content["url"]) + return image_urls + + @register_chat_completion_handler("chatml-function-calling") def chatml_function_calling( llama: llama.Llama, From 25b2f8fe0d92cb27e364d3c9601dde77e50446bf Mon Sep 17 00:00:00 2001 From: kossum <127719370+kossum@users.noreply.github.com> Date: Thu, 3 Apr 2025 06:25:21 +0900 Subject: [PATCH 3/9] resolve the image embedding issue in gemma3 --- llama_cpp/llama_chat_format.py | 101 ++++++++++++++++++++++------- llama_cpp/llava_cpp.py | 112 +++++++++++++++++++++++++++++++++ 2 files changed, 191 insertions(+), 22 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 0d6d39cb8..7ac0f4016 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -2835,24 +2835,7 @@ def __call__( ) llama.eval(tokens) else: - image_bytes = self.load_image(value) - embed = self._embed_image_bytes(image_bytes, llama.context_params.n_threads_batch) - if llama.n_tokens + embed.contents.n_image_pos > llama.n_ctx(): - raise ValueError( - f"Prompt exceeds n_ctx: {llama.n_tokens + embed.contents.n_image_pos} > {llama.n_ctx()}" - ) - n_past = ctypes.c_int(llama.n_tokens) - n_past_p = ctypes.pointer(n_past) - with suppress_stdout_stderr(disable=self.verbose): - self._llava_cpp.llava_eval_image_embed( - llama.ctx, - embed, - llama.n_batch, - n_past_p, - ) - # Required to avoid issues with hf tokenizer - llama.input_ids[llama.n_tokens : n_past.value] = -1 - llama.n_tokens = n_past.value + self.eval_image(llama, value) # Get prompt tokens to avoid a cache miss prompt = llama.input_ids[: llama.n_tokens].tolist() @@ -2938,6 +2921,26 @@ def __call__( ) return _convert_completion_to_chat(completion_or_chunks, stream=stream) + def eval_image(self, llama: llama.Llama, 
image_url: str): + image_bytes = self.load_image(image_url) + embed = self._embed_image_bytes(image_bytes, llama.context_params.n_threads_batch) + if llama.n_tokens + embed.contents.n_image_pos > llama.n_ctx(): + raise ValueError( + f"Prompt exceeds n_ctx: {llama.n_tokens + embed.contents.n_image_pos} > {llama.n_ctx()}" + ) + n_past = ctypes.c_int(llama.n_tokens) + n_past_p = ctypes.pointer(n_past) + with suppress_stdout_stderr(disable=self.verbose): + self._llava_cpp.llava_eval_image_embed( + llama.ctx, + embed, + llama.n_batch, + n_past_p, + ) + # Required to avoid issues with hf tokenizer + llama.input_ids[llama.n_tokens : n_past.value] = -1 + llama.n_tokens = n_past.value + @staticmethod def _load_image(image_url: str) -> bytes: # TODO: Add Pillow support for other image formats beyond (jpg, png) @@ -3435,10 +3438,10 @@ def split_text_on_image_urls(text: str, image_urls: List[str]): if pos != -1: assert len(copied_urls) > 0 if pos > 0: - split_text += [("text", remaining[:pos])] - split_text += [("text", "\n\n")] - split_text += [("image_url", copied_urls.pop(0))] - split_text += [("text", "\n\n")] + split_text.append(("text", remaining[:pos])) + split_text.append(("text", "\n\n")) + split_text.append(("image_url", copied_urls.pop(0))) + split_text.append(("text", "\n\n")) remaining = remaining[pos + len(image_placeholder):] else: assert len(copied_urls) == 0 @@ -3461,6 +3464,60 @@ def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]): image_urls.append(content["url"]) return image_urls + def eval_image(self, llama: llama.Llama, image_url: str): + import llama_cpp + + img_bytes = self.load_image(image_url) + img_u8_p = self._llava_cpp.clip_image_u8_init() + if not self._llava_cpp.clip_image_load_from_bytes( + ctypes.create_string_buffer(img_bytes, len(img_bytes)), + ctypes.c_size_t(len(img_bytes)), + img_u8_p, + ): + self._llava_cpp.clip_image_u8_free(img_u8_p) + raise ValueError("Failed to load image.") + + img_f32 = self._llava_cpp.clip_image_f32_batch() + img_f32_p = ctypes.byref(img_f32) + if not self._llava_cpp.clip_image_preprocess(self.clip_ctx, img_u8_p, img_f32_p): + self._llava_cpp.clip_image_f32_batch_free(img_f32_p) + self._llava_cpp.clip_image_u8_free(img_u8_p) + raise ValueError("Failed to preprocess image.") + + n_embd = llama_cpp.llama_model_n_embd(llama._model.model) + n_tokens = 256 + embed = (ctypes.c_float * (n_tokens * n_embd))() + if not self._llava_cpp.clip_image_batch_encode(self.clip_ctx, llama.n_threads, img_f32_p, embed): + self._llava_cpp.clip_image_f32_batch_free(img_f32_p) + self._llava_cpp.clip_image_u8_free(img_u8_p) + raise ValueError("Failed to encode image.") + + self._llava_cpp.clip_image_f32_batch_free(img_f32_p) + self._llava_cpp.clip_image_u8_free(img_u8_p) + llama_cpp.llama_set_causal_attn(llama.ctx, False) + + seq_id_0 = (ctypes.c_int32 * 1)() + seq_ids = (ctypes.POINTER(ctypes.c_int32) * (n_tokens + 1))() + for i in range(n_tokens): + seq_ids[i] = seq_id_0 + + batch = llama_cpp.llama_batch() + batch.n_tokens = n_tokens + batch.token = None + batch.embd = embed + batch.pos = (ctypes.c_int32 * n_tokens)(*[i + llama.n_tokens for i in range(n_tokens)]) + batch.seq_id = seq_ids + batch.n_seq_id = (ctypes.c_int32 * n_tokens)(*([1] * n_tokens)) + batch.logits = (ctypes.c_int8 * n_tokens)() + + if llama_cpp.llama_decode(llama.ctx, batch): + raise ValueError("Failed to decode image.") + + llama_cpp.llama_set_causal_attn(llama.ctx, True) + # Required to avoid issues with hf tokenizer + llama.input_ids[llama.n_tokens : 
llama.n_tokens + n_tokens] = -1 + llama.n_tokens += n_tokens + @register_chat_completion_handler("chatml-function-calling") def chatml_function_calling( diff --git a/llama_cpp/llava_cpp.py b/llama_cpp/llava_cpp.py index d9dfaf5fd..46ac5087f 100644 --- a/llama_cpp/llava_cpp.py +++ b/llama_cpp/llava_cpp.py @@ -7,6 +7,7 @@ c_int, c_uint8, c_float, + c_size_t, c_void_p, POINTER, _Pointer, # type: ignore @@ -141,6 +142,28 @@ def llava_eval_image_embed( ################################################ +# struct clip_image_u8_batch { +# struct clip_image_u8 * data; +# size_t size; +# }; +class clip_image_u8_batch(Structure): + _fields_ = [ + ("data", c_void_p), + ("size", c_size_t), + ] + + +# struct clip_image_f32_batch { +# struct clip_image_f32 * data; +# size_t size; +# }; +class clip_image_f32_batch(Structure): + _fields_ = [ + ("data", c_void_p), + ("size", c_size_t), + ] + + # /** load mmproj model */ # CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity); @ctypes_function("clip_model_load", [c_char_p, c_int], clip_ctx_p_ctypes) @@ -156,3 +179,92 @@ def clip_model_load( def clip_free(ctx: clip_ctx_p, /): ... + +# CLIP_API struct clip_image_u8 * clip_image_u8_init (); +@ctypes_function("clip_image_u8_init", [], c_void_p) +def clip_image_u8_init() -> Optional[c_void_p]: + ... + + +# CLIP_API void clip_image_u8_free (struct clip_image_u8 * img); +@ctypes_function("clip_image_u8_free", [c_void_p], None) +def clip_image_u8_free(img: c_void_p, /): + ... + + +# CLIP_API void clip_image_f32_free(struct clip_image_f32 * img); +@ctypes_function("clip_image_f32_free", [c_void_p], None) +def clip_image_f32_free(img: c_void_p, /): + ... + + +# CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch); +@ctypes_function("clip_image_u8_batch_free", [POINTER(clip_image_u8_batch)], None) +def clip_image_u8_batch_free(batch: "_Pointer[clip_image_u8_batch]", /): + ... + + +# CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch); +@ctypes_function("clip_image_f32_batch_free", [POINTER(clip_image_f32_batch)], None) +def clip_image_f32_batch_free(batch: "_Pointer[clip_image_f32_batch]", /): + ... + + +# /** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */ +# CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs ); +@ctypes_function( + "clip_image_preprocess", + [ + clip_ctx_p_ctypes, + c_void_p, + POINTER(clip_image_f32_batch), + ], + c_bool, +) +def clip_image_preprocess( + ctx: clip_ctx_p, + img: c_void_p, + res_imgs: "_Pointer[clip_image_f32_batch]", + /, +) -> bool: + ... + + +# CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec); +@ctypes_function( + "clip_image_batch_encode", + [ + clip_ctx_p_ctypes, + c_int, + POINTER(clip_image_f32_batch), + POINTER(c_float), + ], + c_bool, +) +def clip_image_batch_encode( + ctx: clip_ctx_p, + n_threads: c_int, + imgs: "_Pointer[clip_image_f32_batch]", + vec: c_void_p +) -> bool: + ... 
+ + +# /** interpret bytes as an image file with length bytes_length, and use the result to populate img */ +# CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); +@ctypes_function( + "clip_image_load_from_bytes", + [ + c_void_p, + c_size_t, + c_void_p, + ], + c_bool, +) +def clip_image_load_from_bytes( + bytes: c_void_p, + bytes_length: c_size_t, + img: c_void_p, + /, +) -> bool: + ... From 1b455888d40aa2f64ace593ddeb7c54a3087d631 Mon Sep 17 00:00:00 2001 From: kossum <127719370+kossum@users.noreply.github.com> Date: Thu, 3 Apr 2025 19:43:58 +0900 Subject: [PATCH 4/9] fix: added n_ctx check for prompt requirements when embedding images in Gemma3ChatHandler --- llama_cpp/llama_chat_format.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 7ac0f4016..cbac975bd 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3467,6 +3467,12 @@ def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]): def eval_image(self, llama: llama.Llama, image_url: str): import llama_cpp + n_tokens = 256 + if llama.n_tokens + n_tokens > llama.n_ctx(): + raise ValueError( + f"Prompt exceeds n_ctx: {llama.n_tokens + n_tokens} > {llama.n_ctx()}" + ) + img_bytes = self.load_image(image_url) img_u8_p = self._llava_cpp.clip_image_u8_init() if not self._llava_cpp.clip_image_load_from_bytes( @@ -3485,7 +3491,6 @@ def eval_image(self, llama: llama.Llama, image_url: str): raise ValueError("Failed to preprocess image.") n_embd = llama_cpp.llama_model_n_embd(llama._model.model) - n_tokens = 256 embed = (ctypes.c_float * (n_tokens * n_embd))() if not self._llava_cpp.clip_image_batch_encode(self.clip_ctx, llama.n_threads, img_f32_p, embed): self._llava_cpp.clip_image_f32_batch_free(img_f32_p) From 360b04c4e69f9dfea29cdc30a1bbe7bf88ce84ce Mon Sep 17 00:00:00 2001 From: marme Date: Thu, 3 Apr 2025 16:07:58 +0200 Subject: [PATCH 5/9] update to match llama.cpp e0e912f api --- llama_cpp/llama_cpp.py | 125 ++++++++++++++++++++++++++++++----------- 1 file changed, 92 insertions(+), 33 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index f3985ad2f..170020654 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -165,6 +165,10 @@ # llama_sampler_p = NewType("llama_sampler_p", int) # llama_sampler_p_ctypes = ctypes.c_void_p +# struct llama_kv_cache; +llama_kv_cache_p = NewType("llama_kv_cache_p", int) +llama_kv_cache_p_ctypes = ctypes.c_void_p + # typedef int32_t llama_pos; llama_pos = ctypes.c_int32 # typedef int32_t llama_token; @@ -259,7 +263,9 @@ LLAMA_VOCAB_PRE_TYPE_MINERVA = 27 LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28 LLAMA_VOCAB_PRE_TYPE_GPT4O = 29 - +LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30 +LLAMA_VOCAB_PRE_TYPE_TRILLION = 31 +LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32 # // note: these values should be synchronized with ggml_rope # // TODO: maybe move this enum to ggml.h (ggml_rope_type) @@ -630,10 +636,29 @@ class llama_model_kv_override(ctypes.Structure): value: Union[int, float, bool, bytes] + +# struct llama_model_tensor_buft_override { +# const char * pattern; +# ggml_backend_buffer_type_t buft; +# +# }; +class llama_model_tensor_buft_override(ctypes.Structure): + _fields_ = [ + ("pattern", ctypes.c_char_p), + ("buft", ctypes.c_void_p) + ] + + +llama_model_tensor_buft_override_p = ctypes.POINTER(llama_model_tensor_buft_override) + + # struct llama_model_params { # // NULL-terminated list of 
devices to use for offloading (if NULL, all available devices are used) # ggml_backend_dev_t * devices; +# // NULL-terminated list of buffer types to use for tensors that match a pattern +# const struct llama_model_tensor_buft_override * tensor_buft_overrides; + # int32_t n_gpu_layers; // number of layers to store in VRAM # enum llama_split_mode split_mode; // how to split the model across multiple GPUs @@ -695,6 +720,7 @@ class llama_model_params(ctypes.Structure): _fields_ = [ ("devices", ctypes.c_void_p), # NOTE: unnused + ("llama_model_tensor_buft_override", llama_model_tensor_buft_override_p), ("n_gpu_layers", ctypes.c_int32), ("split_mode", ctypes.c_int), ("main_gpu", ctypes.c_int32), @@ -1316,6 +1342,10 @@ def llama_n_vocab(model: llama_vocab_p, /) -> int: def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]: ... +# LLAMA_API struct llama_kv_cache * llama_get_kv_self ( struct llama_context * ctx); +@ctypes_function("llama_get_kv_self", [llama_context_p_ctypes], llama_model_p_ctypes) +def llama_get_kv_self(ctx: llama_context_p, /) -> Optional[llama_kv_cache_p]: + ... # LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); @ctypes_function("llama_pooling_type", [llama_context_p_ctypes], ctypes.c_int) @@ -1810,7 +1840,19 @@ def llama_kv_cache_view_update(ctx: llama_context_p, view: CtypesPointerOrRef[ll # // Returns the number of tokens in the KV cache (slow, use only for debug) # // If a KV cell has multiple sequences assigned to it, it will be counted multiple times -# LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx); +# LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx); +@ctypes_function( + "llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32 +) +def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int: + """Returns the number of tokens in the KV cache (slow, use only for debug) + If a KV cell has multiple sequences assigned to it, it will be counted multiple times + """ + ... + +# // Returns the number of tokens in the KV cache (slow, use only for debug) +# // If a KV cell has multiple sequences assigned to it, it will be counted multiple times +# DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx), "use llama_kv_self_n_tokens instead"); @ctypes_function( "llama_get_kv_cache_token_count", [llama_context_p_ctypes], ctypes.c_int32 ) @@ -1832,10 +1874,10 @@ def llama_get_kv_cache_used_cells(ctx: llama_context_p, /) -> int: # // Clear the KV cache - both cell info is erased and KV data is zeroed -# LLAMA_API void llama_kv_cache_clear( +# LLAMA_API void llama_kv_self_clear( # struct llama_context * ctx); -@ctypes_function("llama_kv_cache_clear", [llama_context_p_ctypes], None) -def llama_kv_cache_clear(ctx: llama_context_p, /): +@ctypes_function("llama_kv_self_clear", [llama_context_p_ctypes], None) +def llama_kv_self_clear(ctx: llama_context_p, /): """Clear the KV cache""" ... 
@@ -1845,13 +1887,13 @@ def llama_kv_cache_clear(ctx: llama_context_p, /): # // seq_id < 0 : match any sequence # // p0 < 0 : [0, p1] # // p1 < 0 : [p0, inf) -# LLAMA_API bool llama_kv_cache_seq_rm( +# LLAMA_API bool llama_kv_self_seq_rm( # struct llama_context * ctx, # llama_seq_id seq_id, # llama_pos p0, # llama_pos p1); @ctypes_function( - "llama_kv_cache_seq_rm", + "llama_kv_self_seq_rm", [ llama_context_p_ctypes, llama_seq_id, @@ -1860,7 +1902,7 @@ def llama_kv_cache_clear(ctx: llama_context_p, /): ], ctypes.c_bool, ) -def llama_kv_cache_seq_rm( +def llama_kv_self_seq_rm( ctx: llama_context_p, seq_id: Union[llama_seq_id, int], p0: Union[llama_pos, int], @@ -1881,14 +1923,14 @@ def llama_kv_cache_seq_rm( # // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence # // p0 < 0 : [0, p1] # // p1 < 0 : [p0, inf) -# LLAMA_API void llama_kv_cache_seq_cp( +# LLAMA_API void llama_kv_self_seq_cp( # struct llama_context * ctx, # llama_seq_id seq_id_src, # llama_seq_id seq_id_dst, # llama_pos p0, # llama_pos p1); @ctypes_function( - "llama_kv_cache_seq_cp", + "llama_kv_self_seq_cp", [ llama_context_p_ctypes, llama_seq_id, @@ -1898,7 +1940,7 @@ def llama_kv_cache_seq_rm( ], None, ) -def llama_kv_cache_seq_cp( +def llama_kv_self_seq_cp( ctx: llama_context_p, seq_id_src: Union[llama_seq_id, int], seq_id_dst: Union[llama_seq_id, int], @@ -1914,13 +1956,13 @@ def llama_kv_cache_seq_cp( # // Removes all tokens that do not belong to the specified sequence -# LLAMA_API void llama_kv_cache_seq_keep( +# LLAMA_API void llama_kv_self_seq_keep( # struct llama_context * ctx, # llama_seq_id seq_id); @ctypes_function( - "llama_kv_cache_seq_keep", [llama_context_p_ctypes, llama_seq_id], None + "llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None ) -def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /): +def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /): """Removes all tokens that do not belong to the specified sequence""" ... @@ -1928,17 +1970,17 @@ def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, in # // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) # // If the KV cache is RoPEd, the KV data is updated accordingly: # // - lazily on next llama_decode() -# // - explicitly with llama_kv_cache_update() +# // - explicitly with llama_kv_self_update() # // p0 < 0 : [0, p1] # // p1 < 0 : [p0, inf) -# LLAMA_API void llama_kv_cache_seq_add( +# LLAMA_API void llama_kv_self_seq_add( # struct llama_context * ctx, # llama_seq_id seq_id, # llama_pos p0, # llama_pos p1, # llama_pos delta); @ctypes_function( - "llama_kv_cache_seq_add", + "llama_kv_self_seq_add", [ llama_context_p_ctypes, llama_seq_id, @@ -1948,7 +1990,7 @@ def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, in ], None, ) -def llama_kv_cache_seq_add( +def llama_kv_self_seq_add( ctx: llama_context_p, seq_id: Union[llama_seq_id, int], p0: Union[llama_pos, int], @@ -1959,7 +2001,7 @@ def llama_kv_cache_seq_add( """Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) If the KV cache is RoPEd, the KV data is updated accordingly: - lazily on next llama_decode() - - explicitly with llama_kv_cache_update() + - explicitly with llama_kv_self_update() p0 < 0 : [0, p1] p1 < 0 : [p0, inf)""" ... 
@@ -1969,14 +2011,14 @@ def llama_kv_cache_seq_add( # // If the KV cache is RoPEd, the KV data is updated accordingly # // p0 < 0 : [0, p1] # // p1 < 0 : [p0, inf) -# LLAMA_API void llama_kv_cache_seq_div( +# LLAMA_API void llama_kv_self_seq_div( # struct llama_context * ctx, # llama_seq_id seq_id, # llama_pos p0, # llama_pos p1, # int d); @ctypes_function( - "llama_kv_cache_seq_div", + "llama_kv_self_seq_div", [ llama_context_p_ctypes, llama_seq_id, @@ -1986,7 +2028,7 @@ def llama_kv_cache_seq_add( ], None, ) -def llama_kv_cache_seq_div( +def llama_kv_self_seq_div( ctx: llama_context_p, seq_id: Union[llama_seq_id, int], p0: Union[llama_pos, int], @@ -2004,29 +2046,29 @@ def llama_kv_cache_seq_div( # // Defragment the KV cache # // This will be applied: # // - lazily on next llama_decode() -# // - explicitly with llama_kv_cache_update() -# LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx); -@ctypes_function("llama_kv_cache_defrag", [llama_context_p_ctypes], None) -def llama_kv_cache_defrag(ctx: llama_context_p, /): +# // - explicitly with llama_kv_self_update() +# LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx); +@ctypes_function("llama_kv_self_defrag", [llama_context_p_ctypes], None) +def llama_kv_self_defrag(ctx: llama_context_p, /): """Defragment the KV cache This will be applied: - lazily on next llama_decode() - - explicitly with llama_kv_cache_update()""" + - explicitly with llama_kv_self_update()""" ... # // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) -# LLAMA_API void llama_kv_cache_update(struct llama_context * ctx); -@ctypes_function("llama_kv_cache_update", [llama_context_p_ctypes], None) -def llama_kv_cache_update(ctx: llama_context_p, /): +# LLAMA_API void llama_kv_self_update(struct llama_context * ctx); +@ctypes_function("llama_kv_self_update", [llama_context_p_ctypes], None) +def llama_kv_self_update(ctx: llama_context_p, /): """Apply the KV cache updates (such as K-shifts, defragmentation, etc.)""" ... # // Check if the context supports KV cache shifting -# LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx); -@ctypes_function("llama_kv_cache_can_shift", [llama_context_p_ctypes], ctypes.c_bool) -def llama_kv_cache_can_shift(ctx: llama_context_p, /) -> bool: +# LLAMA_API bool llama_kv_self_can_shift(struct llama_context * ctx); +@ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool) +def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool: """Check if the context supports KV cache shifting""" ... @@ -2547,6 +2589,16 @@ def llama_set_causal_attn(ctx: llama_context_p, causal_attn: bool, /): ... +# // Set whether the model is in warmup mode or not +# // If true, all model tensors are activated during llama_decode() to load and cache their weights. +# LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup); +@ctypes_function("llama_set_warmup", [llama_context_p_ctypes, ctypes.c_bool], None) +def llama_set_warmup(ctx: llama_context_p, warmup: bool, /): + """Set whether to use causal attention or not + If set to true, the model will only attend to the past tokens""" + ... + + # // Set abort callback # LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data); @ctypes_function( @@ -3701,6 +3753,13 @@ def llama_sampler_init_mirostat_v2( ... + + + +# /// @details Intializes a GBNF grammar, see grammars/README.md for details. 
+# /// @param vocab The vocabulary that this grammar will be used with. +# /// @param grammar_str The production rules for the grammar, encoded as a string. Returns an empty grammar if empty. Returns NULL if parsing of grammar_str fails. +# /// @param grammar_root The name of the start symbol for the grammar. # LLAMA_API struct llama_sampler * llama_sampler_init_grammar( # const struct llama_vocab * vocab, # const char * grammar_str, From 924833237f881339124bd47425b4049af01298f7 Mon Sep 17 00:00:00 2001 From: marme Date: Thu, 3 Apr 2025 17:01:27 +0200 Subject: [PATCH 6/9] update _internals and _utils to match API changes, add deprecation warnings --- llama_cpp/_internals.py | 29 +++++++++++++++++++++++++---- llama_cpp/_utils.py | 16 ++++++++++++++++ 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 343581dce..57a9a5ab4 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -13,6 +13,11 @@ from dataclasses import dataclass, field from contextlib import ExitStack +try: + from warnings import deprecated +except ImportError: + from ._utils import deprecated + import numpy as np import numpy.typing as npt @@ -276,21 +281,37 @@ def n_ctx(self) -> int: def pooling_type(self) -> int: return llama_cpp.llama_pooling_type(self.ctx) + @deprecated("Use llama_kv_self_clear") def kv_cache_clear(self): - llama_cpp.llama_kv_cache_clear(self.ctx) + self.llama_kv_self_clear() + @deprecated("Use kv_self_seq_rm") def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int): - llama_cpp.llama_kv_cache_seq_rm(self.ctx, seq_id, p0, p1) + self.kv_self_seq_rm(seq_id, p0, p1) + @deprecated("Use kv_self_seq_cp") def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int): - llama_cpp.llama_kv_cache_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1) + self.kv_self_seq_cp(seq_id_src, seq_id_dst, p0, p1) + @deprecated("Use kv_self_seq_keep") def kv_cache_seq_keep(self, seq_id: int): - llama_cpp.llama_kv_cache_seq_keep(self.ctx, seq_id) + self.kv_self_seq_keep(seq_id) def kv_cache_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int): llama_cpp.llama_kv_cache_seq_add(self.ctx, seq_id, p0, p1, shift) + def llama_kv_self_clear(self): + llama_cpp.llama_llama_kv_self_clear(self.ctx) + + def kv_self_seq_rm(self, seq_id: int, p0: int, p1: int): + llama_cpp.llama_kv_self_seq_rm(self.ctx, seq_id, p0, p1) + + def kv_self_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int): + llama_cpp.llama_kv_self_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1) + + def kv_self_seq_keep(self, seq_id: int): + llama_cpp.llama_kv_self_seq_keep(self.ctx, seq_id) + def get_state_size(self) -> int: return llama_cpp.llama_get_state_size(self.ctx) diff --git a/llama_cpp/_utils.py b/llama_cpp/_utils.py index 29628193b..75b39f694 100644 --- a/llama_cpp/_utils.py +++ b/llama_cpp/_utils.py @@ -1,5 +1,7 @@ import os import sys +import warnings +import functools from typing import Any, Dict @@ -76,3 +78,17 @@ class Singleton(object, metaclass=MetaSingleton): def __init__(self): super(Singleton, self).__init__() + + +def deprecated(reason): + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + warnings.warn( + f"Call to deprecated function {func.__name__} ({reason}).", + category=DeprecationWarning, + stacklevel=2, + ) + return func(*args, **kwargs) + return wrapper + return decorator From b6e3c89826c4138283496ed36ddf2c32ec813d19 Mon Sep 17 00:00:00 2001 From: marme Date: Thu, 3 Apr 2025 17:18:18 +0200 Subject: [PATCH 
7/9] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 7841fc723..2004644b7 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 7841fc723e059d1fd9640e5c0ef19050fcc7c698 +Subproject commit 2004644b7a5da6fe080e51861ab583480280f1d3 From 025e7fa44bfd071eb36b5641448c4e80a0b29917 Mon Sep 17 00:00:00 2001 From: kossum <127719370+kossum@users.noreply.github.com> Date: Fri, 4 Apr 2025 20:17:26 +0900 Subject: [PATCH 8/9] fix: modify the gemma3 chat template to be compatible with openai api --- llama_cpp/llama_chat_format.py | 17 +---------------- llama_cpp/llava_cpp.py | 3 ++- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index cbac975bd..4e1aad381 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3409,7 +3409,7 @@ class Gemma3ChatHandler(Llava15ChatHandler): "{{ message['content'] | trim }}" "{%- elif message['content'] is iterable -%}" "{%- for item in message['content'] -%}" - "{%- if item['type'] == 'image' -%}" + "{%- if item['type'] == 'image_url' -%}" "{{ '' }}" "{%- elif item['type'] == 'text' -%}" "{{ item['text'] | trim }}" @@ -3449,21 +3449,6 @@ def split_text_on_image_urls(text: str, image_urls: List[str]): remaining = "" return split_text - @staticmethod - def get_image_urls(messages: List[llama_types.ChatCompletionRequestMessage]): - image_urls: List[str] = [] - for message in messages: - if message["role"] == "user": - if message.get("content") is None: - continue - for content in message["content"]: - if isinstance(content, dict) and content.get("type") == "image": - if isinstance(content.get("image"), dict) and isinstance(content["image"].get("url"), str): - image_urls.append(content["image"]["url"]) - elif isinstance(content.get("url"), str): - image_urls.append(content["url"]) - return image_urls - def eval_image(self, llama: llama.Llama, image_url: str): import llama_cpp diff --git a/llama_cpp/llava_cpp.py b/llama_cpp/llava_cpp.py index 46ac5087f..8a382b4d9 100644 --- a/llama_cpp/llava_cpp.py +++ b/llama_cpp/llava_cpp.py @@ -245,7 +245,8 @@ def clip_image_batch_encode( ctx: clip_ctx_p, n_threads: c_int, imgs: "_Pointer[clip_image_f32_batch]", - vec: c_void_p + vec: c_void_p, + /, ) -> bool: ... From 8f21c9000425b4ebf437c6c95bae5e1c53d53173 Mon Sep 17 00:00:00 2001 From: bot08 <71845954+bot08@users.noreply.github.com> Date: Mon, 7 Apr 2025 22:43:17 +0300 Subject: [PATCH 9/9] Update README.md --- README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/README.md b/README.md index e00456580..3e5c22617 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,20 @@ +# Example installing from this fork with CUDA support (PowerShell) + +```powershell +$env:CMAKE_ARGS = "-DGGML_CUDA=on" +pip install git+https://github.com/bot08/llama-cpp-python.git@main +``` + +To force a clean rebuild: + +```powershell +$env:CMAKE_ARGS = "-DGGML_CUDA=on" +pip install --upgrade --force-reinstall --no-cache-dir ` + git+https://github.com/bot08/llama-cpp-python.git@main +``` + +--- +
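
Note: the patches above add a `Gemma3ChatHandler` for multimodal chat (PATCH 2/9, 3/9, 4/9, 8/9), while the README snippet in PATCH 9/9 only covers installation. Below is a minimal usage sketch, not part of the patch series itself, assuming a locally downloaded Gemma 3 GGUF model and its mmproj (CLIP) file; the file paths and the example image URL are placeholders, and the constructor argument mirrors the `Llava15ChatHandler` API that `Gemma3ChatHandler` inherits from.

```python
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Gemma3ChatHandler

# Placeholder paths: point these at a local Gemma 3 GGUF model and its mmproj file.
chat_handler = Gemma3ChatHandler(clip_model_path="models/gemma-3-mmproj.gguf")

llm = Llama(
    model_path="models/gemma-3-4b-it.gguf",
    chat_handler=chat_handler,
    n_ctx=4096,        # leave room for the 256 embedding positions used per image (see PATCH 4/9)
    n_gpu_layers=-1,   # offload all layers; assumes the CUDA build installed above
)

response = llm.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image in one sentence."},
                # PATCH 8/9 switches the template to the OpenAI-style "image_url" content type.
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
            ],
        }
    ],
)
print(response["choices"][0]["message"]["content"])
```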