Skip to content
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
79 commits
Select commit Hold shift + click to select a range
466c6cd
server : (experimental) vision support via libmtmd
ngxson Apr 11, 2025
2317e61
mtmd : add more api around mtmd_image_tokens
ngxson Apr 11, 2025
a46b6db
mtmd : add more api around mtmd_image_tokens
ngxson Apr 11, 2025
7ac0b7b
mtmd : ability to calc image hash
ngxson Apr 11, 2025
58c4767
shared_ptr for mtmd_image_tokens
ngxson Apr 12, 2025
d3c3e20
move hash to user-define ID (fixed)
ngxson Apr 12, 2025
a44029a
Merge branch 'xsn/mtmd_image_api' into xsn/server_mtmd
ngxson Apr 13, 2025
5e6c7ba
abstract out the batch management
ngxson Apr 13, 2025
78a76de
Merge branch 'master' into xsn/server_mtmd
ngxson Apr 14, 2025
c734b53
Merge branch 'master' into xsn/server_mtmd
ngxson Apr 21, 2025
a6a3653
small fix
ngxson Apr 21, 2025
f8bc466
refactor logic adding tokens to batch
ngxson Apr 21, 2025
f5420e1
implement hashing image
ngxson Apr 21, 2025
aae2e69
Merge branch 'master' into xsn/server_mtmd
ngxson Apr 23, 2025
cd11585
use FNV hash, now hash bitmap instead of file data
ngxson Apr 23, 2025
8afa952
allow decoding image embedding to be split into batches
ngxson Apr 23, 2025
989730c
rm whitespace
ngxson Apr 23, 2025
19b9fe1
Merge branch 'master' into xsn/server_mtmd
ngxson Apr 24, 2025
2df8c1a
disable some features when mtmd is on
ngxson Apr 24, 2025
b9ef895
fix --no-mmproj-offload
ngxson Apr 25, 2025
add9e21
mtmd_context_params no timings
ngxson Apr 25, 2025
0f39770
Merge branch 'master' into xsn/server_mtmd
ngxson Apr 25, 2025
58100b3
refactor server_inp to server_tokens
ngxson Apr 25, 2025
e82fea8
fix the failing test case
ngxson Apr 25, 2025
4a4f35c
init
ngxson Apr 29, 2025
f6b6517
wip
ngxson Apr 29, 2025
e0806c2
Merge branch 'master' into xsn/mtmd_c_api
ngxson Apr 29, 2025
82f4246
working version
ngxson Apr 29, 2025
f8c27b9
add mtmd::bitmaps
ngxson Apr 29, 2025
3357961
add test target
ngxson Apr 29, 2025
92d2404
rm redundant define
ngxson Apr 29, 2025
111d5af
test: mtmd_input_chunks_free
ngxson Apr 29, 2025
08d0f9c
rm outdated comment
ngxson Apr 29, 2025
a230804
Merge branch 'master' into xsn/mtmd_c_api
ngxson May 2, 2025
863db31
fix merging issue
ngxson May 2, 2025
a0fb701
explicitly create mtmd::input_chunks
ngxson May 2, 2025
6bc7a30
mtmd_input_chunk_copy
ngxson May 2, 2025
4d842eb
add clone()
ngxson May 2, 2025
f91fb97
Merge branch 'master' into xsn/server_mtmd
ngxson May 3, 2025
2cedd18
improve server_input struct
ngxson May 3, 2025
3ee071c
clip : fix confused naming ffn_up and ffn_down
ngxson May 3, 2025
3fbf0bd
rm ffn_i/o/g naming
ngxson May 3, 2025
f3870a6
rename n_embd, n_ff
ngxson May 3, 2025
ae83229
small fix
ngxson May 3, 2025
0009f76
Merge branch 'master' into xsn/clip_ffn_up_down_fix
ngxson May 3, 2025
246a4e0
no check n_ff
ngxson May 3, 2025
57b288f
Merge branch 'xsn/clip_ffn_up_down_fix' into xsn/server_mtmd
ngxson May 3, 2025
5f1fe1b
fix detokenize
ngxson May 3, 2025
06cb595
Merge branch 'master' into xsn/mtmd_c_api
ngxson May 4, 2025
e9f7ff9
add const to various places
ngxson May 4, 2025
049ae24
add warning about breaking changes
ngxson May 4, 2025
91613c0
Merge branch 'xsn/mtmd_c_api' into xsn/server_mtmd
ngxson May 4, 2025
d3fece5
add c api
ngxson May 4, 2025
076e3b9
helper: use mtmd_image_tokens_get_n_pos
ngxson May 4, 2025
574d403
Merge branch 'xsn/mtmd_c_api' into xsn/server_mtmd
ngxson May 4, 2025
036f682
Merge branch 'master' into xsn/server_mtmd
ngxson May 4, 2025
01c623e
fix ctx_shift
ngxson May 4, 2025
a0f2562
fix name shadowing
ngxson May 4, 2025
9149f39
Merge branch 'master' into xsn/server_mtmd
ngxson May 5, 2025
b353038
Merge branch 'master' into xsn/server_mtmd
ngxson May 6, 2025
3304b44
more strict condition
ngxson May 6, 2025
88461f2
support remote image_url
ngxson May 6, 2025
4adce86
Merge branch 'master' into xsn/server_mtmd
ngxson May 6, 2025
a9b21f4
remote image_url log
ngxson May 6, 2025
2f30530
add CI test
ngxson May 6, 2025
5ffde38
do not log base64
ngxson May 6, 2025
aaebc33
add "has_multimodal" to /props
ngxson May 8, 2025
eeda075
remove dangling image
ngxson May 8, 2025
bef122e
speculative: use slot.cache_tokens.insert
ngxson May 8, 2025
7282456
Merge branch 'master' into xsn/server_mtmd
ngxson May 8, 2025
51afc0a
Apply suggestions from code review
ngxson May 9, 2025
f10fc56
rm can_be_detokenized
ngxson May 9, 2025
689035c
on prmpt processing done, assert cache_tokens.size
ngxson May 9, 2025
b2906a9
handle_completions_impl returns void
ngxson May 9, 2025
abfd821
Merge branch 'master' into xsn/server_mtmd
ngxson May 9, 2025
f5fbc03
adapt the new web ui
ngxson May 9, 2025
5fe8d72
update docs and hot topics
ngxson May 9, 2025
b8000fd
rm assert
ngxson May 9, 2025
9ed430c
small fix (2)
ngxson May 9, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions tests/CMakeLists.txt
Original file line numberDiff line numberDiff line change
Expand Up@@ -162,6 +162,10 @@ if (NOT GGML_BACKEND_DL)
llama_build_and_test(test-rope.cpp)
endif()

# libmtmd
set(LLAMA_TEST_NAME test-mtmd-c-api)
llama_build_and_test(test-mtmd-c-api.c)
target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd)

# dummy executable - not installed
get_filename_component(TEST_TARGET test-c.c NAME_WE)
Expand Down
63 changes: 63 additions & 0 deletions tests/test-mtmd-c-api.c
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
#include <stdio.h>
#include <assert.h>

#include "mtmd.h"

// Prints a diagnostic on stderr when `ok` is false and hands `ok` back, so the
// call can be used directly inside a condition.
// Unlike assert(), this is still evaluated when NDEBUG is defined.
static int expect_true(int ok, const char * what) {
    if (!ok) {
        fprintf(stderr, "Check failed: %s\n", what);
    }
    return ok;
}

// Validates a text chunk: the token array must be non-NULL, non-empty and
// contain only non-negative token ids. Each token is printed as it is checked.
// Returns 1 when every check passes, 0 on the first failure.
static int check_text_chunk(const mtmd_input_chunk * chunk) {
    size_t n_tokens = 0;
    const llama_token * tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
    printf(" Text chunk with %zu tokens\n", n_tokens);
    if (!expect_true(tokens != NULL, "tokens != NULL") ||
        !expect_true(n_tokens > 0,   "n_tokens > 0")) {
        return 0;
    }
    for (size_t j = 0; j < n_tokens; j++) {
        if (!expect_true(tokens[j] >= 0, "tokens[j] >= 0")) {
            return 0;
        }
        printf(" > Token %zu: %d\n", j, tokens[j]);
    }
    return 1;
}

// Validates an image chunk: token count, both dimensions and the ID string
// must all be set. The values are printed only after they have been checked,
// so a NULL id is never handed to printf("%s").
// Returns 1 when every check passes, 0 on the first failure.
static int check_image_chunk(const mtmd_input_chunk * chunk) {
    const mtmd_image_tokens * image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
    // Guard before calling the getters; whether they tolerate NULL is not visible from this file.
    if (!expect_true(image_tokens != NULL, "image_tokens != NULL")) {
        return 0;
    }
    size_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
    size_t nx = mtmd_image_tokens_get_nx(image_tokens);
    size_t ny = mtmd_image_tokens_get_ny(image_tokens);
    const char * id = mtmd_image_tokens_get_id(image_tokens);
    if (!expect_true(n_tokens > 0, "n_tokens > 0") ||
        !expect_true(nx > 0,       "nx > 0")       ||
        !expect_true(ny > 0,       "ny > 0")       ||
        !expect_true(id != NULL,   "id != NULL")) {
        return 0;
    }
    printf(" Image chunk with %zu tokens\n", n_tokens);
    printf(" Image size: %zu x %zu\n", nx, ny);
    printf(" Image ID: %s\n", id);
    return 1;
}

// Smoke test for the libmtmd C API: builds the library-provided test chunks,
// walks them and validates the accessors of each text and image chunk.
// Returns 0 on success and 1 on any failed check. Checks are explicit rather
// than assert()-based so that they are not compiled out when NDEBUG is defined.
int main(void) {
    printf("\n\nTesting libmtmd C API...\n");
    printf("--------\n\n");

    struct mtmd_context_params params = mtmd_context_params_default();
    printf("Default image marker: %s\n", params.image_marker);

    mtmd_input_chunks * chunks = mtmd_test_create_input_chunks();

    if (!chunks) {
        fprintf(stderr, "Failed to create input chunks\n");
        return 1;
    }

    const size_t n_chunks = mtmd_input_chunks_size(chunks);
    printf("Number of chunks: %zu\n", n_chunks);
    int ok = expect_true(n_chunks > 0, "n_chunks > 0");

    for (size_t i = 0; ok && i < n_chunks; i++) {
        const mtmd_input_chunk * chunk = mtmd_input_chunks_get(chunks, i);
        if (!expect_true(chunk != NULL, "chunk != NULL")) {
            ok = 0;
            break;
        }
        const enum mtmd_input_chunk_type type = mtmd_input_chunk_get_type(chunk);
        // Cast explicitly: the underlying type of an enum is implementation-defined, %d expects int.
        printf("Chunk %zu type: %d\n", i, (int) type);

        if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
            ok = check_text_chunk(chunk);
        } else if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
            ok = check_image_chunk(chunk);
        }
    }

    // Free the chunks (single cleanup point, reached on both success and failure)
    mtmd_input_chunks_free(chunks);

    if (!ok) {
        return 1;
    }

    printf("\n\nDONE: test libmtmd C API...\n");

    return 0;
}
9 changes: 9 additions & 0 deletions tools/llava/clip-impl.h
Original file line numberDiff line numberDiff line change
Expand Up@@ -233,6 +233,15 @@ struct clip_image_u8_batch{

struct clip_image_f32_batch{
std::vector<clip_image_f32_ptr> entries;

clip_image_f32_batch clone() const{
clip_image_f32_batch new_batch;
new_batch.entries.reserve(entries.size());
for (const auto & entry : entries){
new_batch.entries.emplace_back(new clip_image_f32(*entry));
}
return new_batch;
}
};

//
Expand Down
8 changes: 4 additions & 4 deletions tools/llava/clip.h
Original file line numberDiff line numberDiff line change
Expand Up@@ -78,10 +78,10 @@ CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);

CLIP_API struct clip_image_size * clip_image_size_init();
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
CLIP_API struct clip_image_f32 * clip_image_f32_init();
CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(); // only used by libllava
CLIP_API struct clip_image_size * clip_image_size_init(void);
CLIP_API struct clip_image_u8 * clip_image_u8_init (void);
CLIP_API struct clip_image_f32 * clip_image_f32_init(void);
CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava

// nx, ny are the output image dimensions
CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
Expand Down
51 changes: 32 additions & 19 deletions tools/llava/mtmd-cli.cpp
Original file line numberDiff line numberDiff line change
Expand Up@@ -63,7 +63,7 @@ static void sigint_handler(int signo){
#endif

struct mtmd_cli_context{
mtmd_context_ptr ctx_vision;
mtmd::context_ptr ctx_vision;
common_init_result llama_init;

llama_model * model;
Expand All@@ -72,7 +72,7 @@ struct mtmd_cli_context{
llama_batch batch;
int n_batch;

std::vector<mtmd_bitmap> bitmaps;
mtmd::bitmaps bitmaps;

// note: we know that the gemma3 template is "linear", meaning each turn is completely separate from the others
// so here we don't need to keep track of chat history
Expand DownExpand Up@@ -119,12 +119,12 @@ struct mtmd_cli_context{

void init_vision_context(common_params & params){
const char * clip_path = params.mmproj.path.c_str();
ctx_vision.reset(mtmd_init_from_file(clip_path, model, mtmd_context_params{
/* use_gpu */ params.mmproj_use_gpu,
/* timings */ true,
/* n_threads */ params.cpuparams.n_threads,
/* verbosity */ params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO,
}));
mtmd_context_params mparams = mtmd_context_params_default();
mparams.use_gpu = params.mmproj_use_gpu;
mparams.print_timings = true;
mparams.n_threads = params.cpuparams.n_threads;
mparams.verbosity = params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
if (!ctx_vision.get()){
LOG_ERR("Failed to load vision model from %s\n", clip_path);
exit(1);
Expand All@@ -143,11 +143,11 @@ struct mtmd_cli_context{
}

bool load_image(const std::string & fname){
mtmd_bitmap bitmap;
if (mtmd_helper_bitmap_init_from_file(fname.c_str(), bitmap)){
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(fname.c_str()));
if (!bmp.ptr){
return false;
}
bitmaps.push_back(std::move(bitmap));
bitmaps.entries.push_back(std::move(bmp));
return true;
}
};
Expand DownExpand Up@@ -197,27 +197,40 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, bool add_
LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str());

mtmd_input_text text;
text.text = formatted_chat.prompt;
text.text = formatted_chat.prompt.c_str();
text.add_special = add_bos;
text.parse_special = true;
mtmd_input_chunks chunks;

if (g_is_interrupted) return 0;

int32_t res = mtmd_tokenize(ctx.ctx_vision.get(), chunks, text, ctx.bitmaps);
mtmd::input_chunks chunks(mtmd_input_chunks_init());
auto bitmaps_c_ptr = ctx.bitmaps.c_ptr();
int32_t res = mtmd_tokenize(ctx.ctx_vision.get(),
chunks.ptr.get(), // output
&text, // text
bitmaps_c_ptr.data(),
bitmaps_c_ptr.size());
if (res != 0){
LOG_ERR("Unable to tokenize prompt, res = %d\n", res);
return 1;
}

ctx.bitmaps.clear();

if (mtmd_helper_eval(ctx.ctx_vision.get(), ctx.lctx, chunks, ctx.n_past, 0, ctx.n_batch)){
ctx.bitmaps.entries.clear();

llama_pos new_n_past;
if (mtmd_helper_eval_chunks(ctx.ctx_vision.get(),
ctx.lctx, // lctx
chunks.ptr.get(), // chunks
ctx.n_past, // n_past
0, // seq_id
ctx.n_batch, // n_batch
true, // logits_last
&new_n_past)){
LOG_ERR("Unable to eval prompt\n");
return 1;
}

ctx.n_past += mtmd_helper_get_n_pos(chunks);
ctx.n_past = new_n_past;

LOG("\n");

Expand DownExpand Up@@ -250,7 +263,7 @@ int main(int argc, char ** argv){
struct common_sampler * smpl = common_sampler_init(ctx.model, params.sampling);
int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;

// ctrl+C handling
// Ctrl+C handling
{
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
Expand Down
Loading