Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
79 commits
Select commit Hold shift + click to select a range
466c6cd
server : (experimental) vision support via libmtmd
ngxson Apr 11, 2025
2317e61
mtmd : add more api around mtmd_image_tokens
ngxson Apr 11, 2025
a46b6db
mtmd : add more api around mtmd_image_tokens
ngxson Apr 11, 2025
7ac0b7b
mtmd : ability to calc image hash
ngxson Apr 11, 2025
58c4767
shared_ptr for mtmd_image_tokens
ngxson Apr 12, 2025
d3c3e20
move hash to user-define ID (fixed)
ngxson Apr 12, 2025
a44029a
Merge branch 'xsn/mtmd_image_api' into xsn/server_mtmd
ngxson Apr 13, 2025
5e6c7ba
abstract out the batch management
ngxson Apr 13, 2025
78a76de
Merge branch 'master' into xsn/server_mtmd
ngxson Apr 14, 2025
c734b53
Merge branch 'master' into xsn/server_mtmd
ngxson Apr 21, 2025
a6a3653
small fix
ngxson Apr 21, 2025
f8bc466
refactor logic adding tokens to batch
ngxson Apr 21, 2025
f5420e1
implement hashing image
ngxson Apr 21, 2025
aae2e69
Merge branch 'master' into xsn/server_mtmd
ngxson Apr 23, 2025
cd11585
use FNV hash, now hash bitmap instead of file data
ngxson Apr 23, 2025
8afa952
allow decoding image embedding to be split into batches
ngxson Apr 23, 2025
989730c
rm whitespace
ngxson Apr 23, 2025
19b9fe1
Merge branch 'master' into xsn/server_mtmd
ngxson Apr 24, 2025
2df8c1a
disable some features when mtmd is on
ngxson Apr 24, 2025
b9ef895
fix --no-mmproj-offload
ngxson Apr 25, 2025
add9e21
mtmd_context_params no timings
ngxson Apr 25, 2025
0f39770
Merge branch 'master' into xsn/server_mtmd
ngxson Apr 25, 2025
58100b3
refactor server_inp to server_tokens
ngxson Apr 25, 2025
e82fea8
fix the failing test case
ngxson Apr 25, 2025
4a4f35c
init
ngxson Apr 29, 2025
f6b6517
wip
ngxson Apr 29, 2025
e0806c2
Merge branch 'master' into xsn/mtmd_c_api
ngxson Apr 29, 2025
82f4246
working version
ngxson Apr 29, 2025
f8c27b9
add mtmd::bitmaps
ngxson Apr 29, 2025
3357961
add test target
ngxson Apr 29, 2025
92d2404
rm redundant define
ngxson Apr 29, 2025
111d5af
test: mtmd_input_chunks_free
ngxson Apr 29, 2025
08d0f9c
rm outdated comment
ngxson Apr 29, 2025
a230804
Merge branch 'master' into xsn/mtmd_c_api
ngxson May 2, 2025
863db31
fix merging issue
ngxson May 2, 2025
a0fb701
explicitly create mtmd::input_chunks
ngxson May 2, 2025
6bc7a30
mtmd_input_chunk_copy
ngxson May 2, 2025
4d842eb
add clone()
ngxson May 2, 2025
f91fb97
Merge branch 'master' into xsn/server_mtmd
ngxson May 3, 2025
2cedd18
improve server_input struct
ngxson May 3, 2025
3ee071c
clip : fix confused naming ffn_up and ffn_down
ngxson May 3, 2025
3fbf0bd
rm ffn_i/o/g naming
ngxson May 3, 2025
f3870a6
rename n_embd, n_ff
ngxson May 3, 2025
ae83229
small fix
ngxson May 3, 2025
0009f76
Merge branch 'master' into xsn/clip_ffn_up_down_fix
ngxson May 3, 2025
246a4e0
no check n_ff
ngxson May 3, 2025
57b288f
Merge branch 'xsn/clip_ffn_up_down_fix' into xsn/server_mtmd
ngxson May 3, 2025
5f1fe1b
fix detokenize
ngxson May 3, 2025
06cb595
Merge branch 'master' into xsn/mtmd_c_api
ngxson May 4, 2025
e9f7ff9
add const to various places
ngxson May 4, 2025
049ae24
add warning about breaking changes
ngxson May 4, 2025
91613c0
Merge branch 'xsn/mtmd_c_api' into xsn/server_mtmd
ngxson May 4, 2025
d3fece5
add c api
ngxson May 4, 2025
076e3b9
helper: use mtmd_image_tokens_get_n_pos
ngxson May 4, 2025
574d403
Merge branch 'xsn/mtmd_c_api' into xsn/server_mtmd
ngxson May 4, 2025
036f682
Merge branch 'master' into xsn/server_mtmd
ngxson May 4, 2025
01c623e
fix ctx_shift
ngxson May 4, 2025
a0f2562
fix name shadowing
ngxson May 4, 2025
9149f39
Merge branch 'master' into xsn/server_mtmd
ngxson May 5, 2025
b353038
Merge branch 'master' into xsn/server_mtmd
ngxson May 6, 2025
3304b44
more strict condition
ngxson May 6, 2025
88461f2
support remote image_url
ngxson May 6, 2025
4adce86
Merge branch 'master' into xsn/server_mtmd
ngxson May 6, 2025
a9b21f4
remote image_url log
ngxson May 6, 2025
2f30530
add CI test
ngxson May 6, 2025
5ffde38
do not log base64
ngxson May 6, 2025
aaebc33
add "has_multimodal" to /props
ngxson May 8, 2025
eeda075
remove dangling image
ngxson May 8, 2025
bef122e
speculative: use slot.cache_tokens.insert
ngxson May 8, 2025
7282456
Merge branch 'master' into xsn/server_mtmd
ngxson May 8, 2025
51afc0a
Apply suggestions from code review
ngxson May 9, 2025
f10fc56
rm can_be_detokenized
ngxson May 9, 2025
689035c
on prompt processing done, assert cache_tokens.size
ngxson May 9, 2025
b2906a9
handle_completions_impl returns void
ngxson May 9, 2025
abfd821
Merge branch 'master' into xsn/server_mtmd
ngxson May 9, 2025
f5fbc03
adapt the new web ui
ngxson May 9, 2025
5fe8d72
update docs and hot topics
ngxson May 9, 2025
b8000fd
rm assert
ngxson May 9, 2025
9ed430c
small fix (2)
ngxson May 9, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
PrevPrevious commit
NextNext commit
add c api
  • Loading branch information
@ngxson
ngxson committed May 4, 2025
commit d3fece5994fac243758491902406b22d371e1645
44 changes: 26 additions & 18 deletions tools/server/server.cpp
Original file line numberDiff line numberDiff line change
Expand Up@@ -1974,12 +1974,11 @@ struct server_context{

std::string & mmproj_path = params_base.mmproj.path;
if (!mmproj_path.empty()){
mtmd_context_params mparams{
/* use_gpu */ params_base.mmproj_use_gpu,
/* timings */ false,
/* n_threads */ params_base.cpuparams.n_threads,
/* verbosity */ params_base.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO,
};
mtmd_context_params mparams = mtmd_context_params_default();
mparams.use_gpu = params_base.mmproj_use_gpu;
mparams.print_timings = false;
mparams.n_threads = params_base.cpuparams.n_threads;
mparams.verbosity = params_base.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams);
if (mctx == nullptr){
SRV_ERR("failed to load multimodal model, '%s'\n", mmproj_path.c_str());
Expand DownExpand Up@@ -3214,8 +3213,10 @@ struct server_context{
// check if we should process the image
if (cur_tok == LLAMA_TOKEN_NULL){
// process the image
int32_t n_pos = slot.n_past;
int32_t res = slot.prompt_tokens.process_chunk(ctx, mctx, slot.n_past, slot.id, n_pos);
int32_t new_n_past;
int32_t res = slot.prompt_tokens.process_chunk(ctx, mctx, slot.n_past, slot.id, new_n_past);
int32_t n_pos = new_n_past - slot.n_past;

if (res != 0){
SLT_ERR(slot, "failed to process image, res = %d\n", res);
slot.release();
Expand All@@ -3224,7 +3225,8 @@ struct server_context{
}

if (slot.params.cache_prompt){
slot.prompt_tokens.move_chunk(slot.cache_tokens, slot.n_past);
const auto & chunk = slot.prompt_tokens.find_chunk(slot.n_past);
slot.cache_tokens.push_back(chunk.get()); // copy
}

slot.n_past += n_pos;
Expand DownExpand Up@@ -4073,21 +4075,21 @@ int main(int argc, char ** argv){
//SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());

// process files
std::vector<mtmd_bitmap> bitmaps;
mtmd::bitmaps bitmaps;
const bool has_mtmd = ctx_server.mctx != nullptr;
{
if (!has_mtmd && !files.empty()){
throw std::runtime_error("This server does not support multimodal");
}
for (auto & file : files){
mtmd_bitmap bmp;
int32_t res = mtmd_helper_bitmap_init_from_buf(file.data(), file.size(), bmp);
if (res != 0){
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(file.data(), file.size()));
if (!bmp.ptr){
throw std::runtime_error("Failed to load image");
}
// calculate bitmap hash (for KV caching)
bmp.id = fnv_hash(bmp.data.data(), bmp.data.size());
bitmaps.push_back(std::move(bmp));
std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
bmp.set_id(hash.c_str());
bitmaps.entries.push_back(std::move(bmp));
}
}

Expand All@@ -4098,13 +4100,19 @@ int main(int argc, char ** argv){

} else if (oaicompat && has_mtmd){
// multimodal
std::string prompt_str = prompt.get<std::string>();
mtmd_input_text inp_txt ={
prompt.get<std::string>(),
prompt_str.c_str(),
/* add_special */ true,
/* parse_special */ true,
};
mtmd_input_chunks chunks;
int32_t tokenized = mtmd_tokenize(ctx_server.mctx, chunks, inp_txt, bitmaps);
mtmd::input_chunks chunks(mtmd_input_chunks_init());
auto bitmaps_c_ptr = bitmaps.c_ptr();
int32_t tokenized = mtmd_tokenize(ctx_server.mctx,
chunks.ptr.get(),
&inp_txt,
bitmaps_c_ptr.data(),
bitmaps_c_ptr.size());
if (tokenized != 0){
throw std::runtime_error("Failed to tokenize prompt");
}
Expand Down
94 changes: 43 additions & 51 deletions tools/server/utils.hpp
Original file line numberDiff line numberDiff line change
Expand Up@@ -998,7 +998,7 @@ struct server_tokens{
private: // disallow accessing these members directly, risking out-of-sync

// map a **start** position in tokens to the image chunk
std::unordered_map<llama_pos, mtmd_input_chunk> map_pos_to_image;
std::unordered_map<llama_pos, mtmd::input_chunk_ptr> map_pos_to_image;

// list of tokens
// it can include LLAMA_TOKEN_NULL, which is used to indicate a token that is not a text token
Expand DownExpand Up@@ -1027,9 +1027,9 @@ struct server_tokens{
// Unchecked element access, mirroring std::vector::operator[].
// Note: the non-const overload returns by value (a copy), so it cannot be
// used to assign through — presumably intentional to keep tokens in sync
// with map_pos_to_image; TODO confirm.
// Fix: both definitions were missing the statement-terminating semicolon
// and would not compile.
llama_token operator[](size_t index){ return tokens[index]; }
const llama_token& operator[](size_t index) const{ return tokens[index]; }

server_tokens(mtmd_input_chunks & mtmd_chunks, bool has_mtmd) : has_mtmd(has_mtmd){
for (auto & c : mtmd_chunks){
push_back(std::move(c));
server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd) : has_mtmd(has_mtmd){
for (size_t i = 0; i < mtmd_chunks.size(); ++i){
push_back(mtmd_chunks[i]);
}
}

Expand All@@ -1054,7 +1054,7 @@ struct server_tokens{
return oss.str();
}

const mtmd_input_chunk & find_chunk(llama_pos pos) const{
const mtmd::input_chunk_ptr & find_chunk(llama_pos pos) const{
auto it = map_pos_to_image.find(pos);
if (it != map_pos_to_image.end()){
return it->second;
Expand All@@ -1070,35 +1070,31 @@ struct server_tokens{
tokens.emplace_back(tok);
}

void push_back(mtmd_input_chunk && chunk){
if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE){
// will create a copy of the chunk if it contains non-text data
void push_back(const mtmd_input_chunk * chunk){
auto type = mtmd_input_chunk_get_type(chunk);
if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE){
GGML_ASSERT(has_mtmd);
GGML_ASSERT(chunk.tokens_image != nullptr);
const int n_pos = mtmd_image_tokens_get_n_pos(chunk.tokens_image.get());
auto img_tokens = mtmd_input_chunk_get_tokens_image(chunk);
const int n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
llama_pos start_pos = tokens.size();
printf("start_pos = %d, n_pos = %d\n", start_pos, n_pos);
for (int i = 0; i < n_pos; ++i){
tokens.emplace_back(LLAMA_TOKEN_NULL);
}
// TODO: use mtmd_input_chunk_copy when the C API is ready
map_pos_to_image[start_pos] = std::move(chunk);
} else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT){
for (auto & tok : chunk.tokens_text){
push_back(tok);
mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
map_pos_to_image[start_pos] = std::move(new_chunk);
} else if (type == MTMD_INPUT_CHUNK_TYPE_TEXT){
size_t n_tokens;
auto text_tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
for (size_t i = 0; i < n_tokens; ++i){
push_back(text_tokens[i]);
}
} else{
GGML_ABORT("Invalid chunk type");
}
}

// TODO: use mtmd_input_chunk_copy when the C API is ready
void move_chunk(server_tokens & dst, llama_pos pos){
auto it = map_pos_to_image.find(pos);
if (it == map_pos_to_image.end()){
throw std::runtime_error("Chunk not found");
}
dst.push_back(std::move(it->second));
}

void insert(llama_tokens & tokens){
tokens.insert(tokens.end(), tokens.begin(), tokens.end());
}
Expand All@@ -1116,6 +1112,7 @@ struct server_tokens{
}

void resize(size_t n){
GGML_ASSERT(n <= tokens.size());
// we throw an error if we try to remove a token in the middle of an image
// for ex. with input of 5 text tokens and 2 images:
// [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
Expand DownExpand Up@@ -1164,12 +1161,16 @@ struct server_tokens{
GGML_ASSERT(has_mtmd);
const auto & a_chunk = find_chunk(i);
const auto & b_chunk = b.find_chunk(i);
std::string ai_id = mtmd_image_tokens_get_id(a_chunk.tokens_image.get());
std::string bi_id = mtmd_image_tokens_get_id(b_chunk.tokens_image.get());
if (ai_id == bi_id){
size_t n_pos = mtmd_image_tokens_get_n_pos(a_chunk.tokens_image.get());
GGML_ASSERT(n_pos > 0 && "Invalid image token"); // should never happen
i += n_pos - 1; // will be +1 by the for loop
GGML_ASSERT(a_chunk && b_chunk);
const auto * a_img = mtmd_input_chunk_get_tokens_image(a_chunk.get());
const auto * b_img = mtmd_input_chunk_get_tokens_image(b_chunk.get());
std::string ai_id = mtmd_image_tokens_get_id(a_img);
std::string bi_id = mtmd_image_tokens_get_id(b_img);
size_t a_pos = mtmd_image_tokens_get_n_pos(a_img);
size_t b_pos = mtmd_image_tokens_get_n_pos(b_img);
if (ai_id == bi_id && a_pos == b_pos){
GGML_ASSERT(a_pos > 0 && "Invalid image token"); // should never happen
i += a_pos - 1; // will be +1 by the for loop
continue;
} else{
return i;
Expand All@@ -1190,7 +1191,8 @@ struct server_tokens{
if (t == LLAMA_TOKEN_NULL){
try{
const auto & chunk = find_chunk(i);
size_t n_pos = mtmd_image_tokens_get_n_pos(chunk.tokens_image.get());
const auto * img_tokens = mtmd_input_chunk_get_tokens_image(chunk.get());
size_t n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
i += n_pos - 1; // will be +1 by the for loop
} catch (const std::exception & e){
return false;
Expand All@@ -1202,7 +1204,7 @@ struct server_tokens{
return true;
}

// TODO: (IMPORTANT) this is hacky ; use mtmd helper when C API is ready
// encode and decode the image chunk
int32_t process_chunk(
llama_context * ctx,
mtmd_context * mctx,
Expand All@@ -1213,34 +1215,24 @@ struct server_tokens{
if (it == map_pos_to_image.end()){
throw std::runtime_error("Chunk not found");
}
size_t n_pos = mtmd_image_tokens_get_n_pos(it->second.tokens_image.get());
mtmd_input_chunks chunks;
{
mtmd_input_chunk chunk0{
/* type */ MTMD_INPUT_CHUNK_TYPE_IMAGE,
/* tokens_text */{},
/* tokens_image */ std::move(it->second.tokens_image), // move it back later
};
mtmd_input_chunk chunk1{
/* type */ MTMD_INPUT_CHUNK_TYPE_TEXT,
/* tokens_text */{},
/* tokens_image */ nullptr,
};
chunks.emplace_back(std::move(chunk0));
chunks.emplace_back(std::move(chunk1));
}
SRV_INF("%s\n", "processing image...");
int32_t n_batch = llama_n_batch(ctx);
int64_t t0 = ggml_time_ms();
int32_t result = mtmd_helper_eval(mctx, ctx, chunks, n_past, seq_id, n_batch);
llama_pos new_n_past = n_past;
int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
it->second.get(), // chunk
n_past,
seq_id,
n_batch,
true, // logits last
&new_n_past);
SRV_INF("image processed in %" PRId64 " ms\n", ggml_time_ms() - t0);
it->second.tokens_image = std::move(chunks[0].tokens_image);
if (result != 0){
LOG_ERR("mtmd_helper_eval failed with status %d", result);
n_pos_out = 0;
n_pos_out = n_past;
return result;
}
n_pos_out = n_pos;
n_pos_out = new_n_past;
return 0;
}
};
Expand Down