ggml-org · ngxson · May 9, 2025 · Apr 11, 2025 · Apr 11, 2025 · Apr 11, 2025
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -162,6 +162,10 @@ if (NOT GGML_BACKEND_DL)
  llama_build_and_test(test-rope.cpp)
 endif()
 
+# libmtmd: C API test (test-mtmd-c-api.c)
+set(LLAMA_TEST_NAME test-mtmd-c-api) # name reused below as the target that gets linked against mtmd
+llama_build_and_test(test-mtmd-c-api.c) # presumably creates the ${LLAMA_TEST_NAME} target; defined outside this diff
+target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd) # the test includes mtmd.h and calls mtmd_* functions
 
 # dummy executable - not installed
 get_filename_component(TEST_TARGET test-c.c NAME_WE)

diff --git a/tests/test-mtmd-c-api.c b/tests/test-mtmd-c-api.c
@@ -0,0 +1,76 @@
+// Smoke test for the libmtmd C API: compiled as C, it walks the chunks returned by
+// mtmd_test_create_input_chunks() and validates each one through the public accessors.
+
+// assert() expands to nothing when NDEBUG is defined, and every check below is an
+// assert(), so force assertions on regardless of the build configuration.
+#undef NDEBUG
+
+#include <stdio.h>
+#include <assert.h>
+
+#include "mtmd.h"
+
+// Returns 0 when every chunk passes the checks, 1 when the chunk list cannot be
+// created; a failed check aborts the process through assert().
+int main(void) {
+    printf("\n\nTesting libmtmd C API...\n");
+    printf("--------\n\n");
+
+    struct mtmd_context_params params = mtmd_context_params_default();
+    assert(params.image_marker != NULL); // passing NULL to %s is undefined behavior
+    printf("Default image marker: %s\n", params.image_marker);
+
+    mtmd_input_chunks * chunks = mtmd_test_create_input_chunks();
+
+    if (!chunks) {
+        fprintf(stderr, "Failed to create input chunks\n");
+        return 1;
+    }
+
+    size_t n_chunks = mtmd_input_chunks_size(chunks);
+    printf("Number of chunks: %zu\n", n_chunks);
+    assert(n_chunks > 0);
+
+    for (size_t i = 0; i < n_chunks; i++) {
+        const mtmd_input_chunk * chunk = mtmd_input_chunks_get(chunks, i);
+        assert(chunk != NULL);
+        enum mtmd_input_chunk_type type = mtmd_input_chunk_get_type(chunk);
+        printf("Chunk %zu type: %d\n", i, (int) type);
+
+        if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+            // start from 0 so that a getter which leaves the out-parameter untouched
+            // trips the n_tokens check instead of reading an indeterminate value
+            size_t n_tokens = 0;
+            const llama_token * tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
+            assert(tokens != NULL);
+            assert(n_tokens > 0);
+            printf(" Text chunk with %zu tokens\n", n_tokens);
+            for (size_t j = 0; j < n_tokens; j++) {
+                assert(tokens[j] >= 0);
+                printf(" > Token %zu: %d\n", j, (int) tokens[j]);
+            }
+
+        } else if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+            const mtmd_image_tokens * image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
+            assert(image_tokens != NULL); // check before handing it to the getters below
+            size_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
+            size_t nx = mtmd_image_tokens_get_nx(image_tokens);
+            size_t ny = mtmd_image_tokens_get_ny(image_tokens);
+            const char * id = mtmd_image_tokens_get_id(image_tokens);
+            assert(n_tokens > 0);
+            assert(nx > 0);
+            assert(ny > 0);
+            assert(id != NULL);
+            printf(" Image chunk with %zu tokens\n", n_tokens);
+            printf(" Image size: %zu x %zu\n", nx, ny);
+            printf(" Image ID: %s\n", id);
+        }
+    }
+
+    // Free the chunks
+    mtmd_input_chunks_free(chunks);
+
+    printf("\n\nDONE: test libmtmd C API...\n");
+
+    return 0;
+}
diff --git a/tools/llava/clip-impl.h b/tools/llava/clip-impl.h
@@ -233,6 +233,15 @@ struct clip_image_u8_batch{
 
 struct clip_image_f32_batch{
  std::vector<clip_image_f32_ptr> entries;
+
+    // Deep copy of the batch: each entry is copy-constructed into a newly
+    // allocated clip_image_f32, so no clip_image_f32 object is shared with *this.
+    clip_image_f32_batch clone() const {
+        clip_image_f32_batch copy;
+        copy.entries.reserve(entries.size());
+        for (const auto & src : entries) { copy.entries.emplace_back(new clip_image_f32(*src)); }
+        return copy;
+    }
 };
 
 //

diff --git a/tools/llava/clip.h b/tools/llava/clip.h
@@ -78,10 +78,10 @@ CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
 CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
 CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);
 
-CLIP_API struct clip_image_size * clip_image_size_init();
-CLIP_API struct clip_image_u8 * clip_image_u8_init ();
-CLIP_API struct clip_image_f32 * clip_image_f32_init();
-CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(); // only used by libllava
+CLIP_API struct clip_image_size * clip_image_size_init(void); // (void), not (): before C23 an empty list declares no prototype in C
+CLIP_API struct clip_image_u8 * clip_image_u8_init (void);
+CLIP_API struct clip_image_f32 * clip_image_f32_init(void);
+CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava
 
 // nx, ny are the output image dimensions
 CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);

diff --git a/tools/llava/mtmd-cli.cpp b/tools/llava/mtmd-cli.cpp
@@ -63,7 +63,7 @@ static void sigint_handler(int signo){
 #endif
 
 struct mtmd_cli_context{
-mtmd_context_ptr ctx_vision;
+mtmd::context_ptr ctx_vision;
  common_init_result llama_init;
 
  llama_model * model;
@@ -72,7 +72,7 @@ struct mtmd_cli_context{
  llama_batch batch;
  int n_batch;
 
-std::vector<mtmd_bitmap> bitmaps;
+mtmd::bitmaps bitmaps;
 
  // note: we know that gemma3 template is "linear", meaning each turn is completely separated to another
  // so here we don't need to keep track of chat history
@@ -119,12 +119,12 @@ struct mtmd_cli_context{
 
  void init_vision_context(common_params & params){
  const char * clip_path = params.mmproj.path.c_str();
-ctx_vision.reset(mtmd_init_from_file(clip_path, model, mtmd_context_params{
- /* use_gpu */ params.mmproj_use_gpu,
- /* timings */ true,
- /* n_threads */ params.cpuparams.n_threads,
- /* verbosity */ params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO,
-}));
+mtmd_context_params mparams = mtmd_context_params_default();
+mparams.use_gpu = params.mmproj_use_gpu;
+mparams.print_timings = true;
+mparams.n_threads = params.cpuparams.n_threads;
+mparams.verbosity = params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
+ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
  if (!ctx_vision.get()){
  LOG_ERR("Failed to load vision model from %s\n", clip_path);
  exit(1);
@@ -143,11 +143,11 @@ struct mtmd_cli_context{
  }
 
  bool load_image(const std::string & fname){
-mtmd_bitmap bitmap;
- if (mtmd_helper_bitmap_init_from_file(fname.c_str(), bitmap)){
+mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(fname.c_str())); // build a bitmap from the file at fname (helper defined outside this diff)
+ if (!bmp.ptr){ // an empty ptr is reported to the caller as failure
  return false;
  }
- bitmaps.push_back(std::move(bitmap));
+ bitmaps.entries.push_back(std::move(bmp)); // keep it in the pending list that eval_message() tokenizes and then clears
  return true;
  }
 };
@@ -197,27 +197,40 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, bool add_
  LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str());
 
  mtmd_input_text text;
- text.text = formatted_chat.prompt;
+ text.text = formatted_chat.prompt.c_str();
  text.add_special = add_bos;
  text.parse_special = true;
- mtmd_input_chunks chunks;
 
  if (g_is_interrupted) return 0;
 
- int32_t res = mtmd_tokenize(ctx.ctx_vision.get(), chunks, text, ctx.bitmaps);
+ mtmd::input_chunks chunks(mtmd_input_chunks_init());
+ auto bitmaps_c_ptr = ctx.bitmaps.c_ptr();
+ int32_t res = mtmd_tokenize(ctx.ctx_vision.get(),
+ chunks.ptr.get(), // output
+ &text, // text
+ bitmaps_c_ptr.data(),
+ bitmaps_c_ptr.size());
  if (res != 0){
  LOG_ERR("Unable to tokenize prompt, res = %d\n", res);
  return 1;
  }
 
- ctx.bitmaps.clear();
-
- if (mtmd_helper_eval(ctx.ctx_vision.get(), ctx.lctx, chunks, ctx.n_past, 0, ctx.n_batch)){
+ ctx.bitmaps.entries.clear();
+
+ llama_pos new_n_past;
+ if (mtmd_helper_eval_chunks(ctx.ctx_vision.get(),
+ ctx.lctx, // lctx
+ chunks.ptr.get(), // chunks
+ ctx.n_past, // n_past
+ 0, // seq_id
+ ctx.n_batch, // n_batch
+ true, // logits_last
+ &new_n_past)){
  LOG_ERR("Unable to eval prompt\n");
  return 1;
  }
 
- ctx.n_past += mtmd_helper_get_n_pos(chunks);
+ ctx.n_past = new_n_past;
 
  LOG("\n");
 
@@ -250,7 +263,7 @@ int main(int argc, char ** argv){
  struct common_sampler * smpl = common_sampler_init(ctx.model, params.sampling);
  int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;
 
- // ctrl+C handling
+ // Ctrl+C handling
 {
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
  struct sigaction sigint_action;