Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
79 commits
Select commit Hold shift + click to select a range
466c6cd
server : (experimental) vision support via libmtmd
ngxson Apr 11, 2025
2317e61
mtmd : add more api around mtmd_image_tokens
ngxson Apr 11, 2025
a46b6db
mtmd : add more api around mtmd_image_tokens
ngxson Apr 11, 2025
7ac0b7b
mtmd : ability to calc image hash
ngxson Apr 11, 2025
58c4767
shared_ptr for mtmd_image_tokens
ngxson Apr 12, 2025
d3c3e20
move hash to user-define ID (fixed)
ngxson Apr 12, 2025
a44029a
Merge branch 'xsn/mtmd_image_api' into xsn/server_mtmd
ngxson Apr 13, 2025
5e6c7ba
abstract out the batch management
ngxson Apr 13, 2025
78a76de
Merge branch 'master' into xsn/server_mtmd
ngxson Apr 14, 2025
c734b53
Merge branch 'master' into xsn/server_mtmd
ngxson Apr 21, 2025
a6a3653
small fix
ngxson Apr 21, 2025
f8bc466
refactor logic adding tokens to batch
ngxson Apr 21, 2025
f5420e1
implement hashing image
ngxson Apr 21, 2025
aae2e69
Merge branch 'master' into xsn/server_mtmd
ngxson Apr 23, 2025
cd11585
use FNV hash, now hash bitmap instead of file data
ngxson Apr 23, 2025
8afa952
allow decoding image embedding to be split into batches
ngxson Apr 23, 2025
989730c
rm whitespace
ngxson Apr 23, 2025
19b9fe1
Merge branch 'master' into xsn/server_mtmd
ngxson Apr 24, 2025
2df8c1a
disable some features when mtmd is on
ngxson Apr 24, 2025
b9ef895
fix --no-mmproj-offload
ngxson Apr 25, 2025
add9e21
mtmd_context_params no timings
ngxson Apr 25, 2025
0f39770
Merge branch 'master' into xsn/server_mtmd
ngxson Apr 25, 2025
58100b3
refactor server_inp to server_tokens
ngxson Apr 25, 2025
e82fea8
fix the failing test case
ngxson Apr 25, 2025
4a4f35c
init
ngxson Apr 29, 2025
f6b6517
wip
ngxson Apr 29, 2025
e0806c2
Merge branch 'master' into xsn/mtmd_c_api
ngxson Apr 29, 2025
82f4246
working version
ngxson Apr 29, 2025
f8c27b9
add mtmd::bitmaps
ngxson Apr 29, 2025
3357961
add test target
ngxson Apr 29, 2025
92d2404
rm redundant define
ngxson Apr 29, 2025
111d5af
test: mtmd_input_chunks_free
ngxson Apr 29, 2025
08d0f9c
rm outdated comment
ngxson Apr 29, 2025
a230804
Merge branch 'master' into xsn/mtmd_c_api
ngxson May 2, 2025
863db31
fix merging issue
ngxson May 2, 2025
a0fb701
explicitly create mtmd::input_chunks
ngxson May 2, 2025
6bc7a30
mtmd_input_chunk_copy
ngxson May 2, 2025
4d842eb
add clone()
ngxson May 2, 2025
f91fb97
Merge branch 'master' into xsn/server_mtmd
ngxson May 3, 2025
2cedd18
improve server_input struct
ngxson May 3, 2025
3ee071c
clip : fix confused naming ffn_up and ffn_down
ngxson May 3, 2025
3fbf0bd
rm ffn_i/o/g naming
ngxson May 3, 2025
f3870a6
rename n_embd, n_ff
ngxson May 3, 2025
ae83229
small fix
ngxson May 3, 2025
0009f76
Merge branch 'master' into xsn/clip_ffn_up_down_fix
ngxson May 3, 2025
246a4e0
no check n_ff
ngxson May 3, 2025
57b288f
Merge branch 'xsn/clip_ffn_up_down_fix' into xsn/server_mtmd
ngxson May 3, 2025
5f1fe1b
fix detokenize
ngxson May 3, 2025
06cb595
Merge branch 'master' into xsn/mtmd_c_api
ngxson May 4, 2025
e9f7ff9
add const to various places
ngxson May 4, 2025
049ae24
add warning about breaking changes
ngxson May 4, 2025
91613c0
Merge branch 'xsn/mtmd_c_api' into xsn/server_mtmd
ngxson May 4, 2025
d3fece5
add c api
ngxson May 4, 2025
076e3b9
helper: use mtmd_image_tokens_get_n_pos
ngxson May 4, 2025
574d403
Merge branch 'xsn/mtmd_c_api' into xsn/server_mtmd
ngxson May 4, 2025
036f682
Merge branch 'master' into xsn/server_mtmd
ngxson May 4, 2025
01c623e
fix ctx_shift
ngxson May 4, 2025
a0f2562
fix name shadowing
ngxson May 4, 2025
9149f39
Merge branch 'master' into xsn/server_mtmd
ngxson May 5, 2025
b353038
Merge branch 'master' into xsn/server_mtmd
ngxson May 6, 2025
3304b44
more strict condition
ngxson May 6, 2025
88461f2
support remote image_url
ngxson May 6, 2025
4adce86
Merge branch 'master' into xsn/server_mtmd
ngxson May 6, 2025
a9b21f4
remote image_url log
ngxson May 6, 2025
2f30530
add CI test
ngxson May 6, 2025
5ffde38
do not log base64
ngxson May 6, 2025
aaebc33
add "has_multimodal" to /props
ngxson May 8, 2025
eeda075
remove dangling image
ngxson May 8, 2025
bef122e
speculative: use slot.cache_tokens.insert
ngxson May 8, 2025
7282456
Merge branch 'master' into xsn/server_mtmd
ngxson May 8, 2025
51afc0a
Apply suggestions from code review
ngxson May 9, 2025
f10fc56
rm can_be_detokenized
ngxson May 9, 2025
689035c
on prompt processing done, assert cache_tokens.size
ngxson May 9, 2025
b2906a9
handle_completions_impl returns void
ngxson May 9, 2025
abfd821
Merge branch 'master' into xsn/server_mtmd
ngxson May 9, 2025
f5fbc03
adapt the new web ui
ngxson May 9, 2025
5fe8d72
update docs and hot topics
ngxson May 9, 2025
b8000fd
rm assert
ngxson May 9, 2025
9ed430c
small fix (2)
ngxson May 9, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
PrevPrevious commit
NextNext commit
add c api
  • Loading branch information
@ngxson
ngxson committed May 4, 2025
commit d3fece5994fac243758491902406b22d371e1645
44 changes: 26 additions & 18 deletions tools/server/server.cpp
Original file line numberDiff line numberDiff line change
Expand Up@@ -1974,12 +1974,11 @@ struct server_context{

std::string & mmproj_path = params_base.mmproj.path;
if (!mmproj_path.empty()){
mtmd_context_params mparams{
/* use_gpu */ params_base.mmproj_use_gpu,
/* timings */ false,
/* n_threads */ params_base.cpuparams.n_threads,
/* verbosity */ params_base.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO,
};
mtmd_context_params mparams = mtmd_context_params_default();
mparams.use_gpu = params_base.mmproj_use_gpu;
mparams.print_timings = false;
mparams.n_threads = params_base.cpuparams.n_threads;
mparams.verbosity = params_base.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams);
if (mctx == nullptr){
SRV_ERR("failed to load multimodal model, '%s'\n", mmproj_path.c_str());
Expand DownExpand Up@@ -3214,8 +3213,10 @@ struct server_context{
// check if we should process the image
if (cur_tok == LLAMA_TOKEN_NULL){
// process the image
int32_t n_pos = slot.n_past;
int32_t res = slot.prompt_tokens.process_chunk(ctx, mctx, slot.n_past, slot.id, n_pos);
int32_t new_n_past;
int32_t res = slot.prompt_tokens.process_chunk(ctx, mctx, slot.n_past, slot.id, new_n_past);
int32_t n_pos = new_n_past - slot.n_past;

if (res != 0){
SLT_ERR(slot, "failed to process image, res = %d\n", res);
slot.release();
Expand All@@ -3224,7 +3225,8 @@ struct server_context{
}

if (slot.params.cache_prompt){
slot.prompt_tokens.move_chunk(slot.cache_tokens, slot.n_past);
const auto & chunk = slot.prompt_tokens.find_chunk(slot.n_past);
slot.cache_tokens.push_back(chunk.get()); // copy
}

slot.n_past += n_pos;
Expand DownExpand Up@@ -4073,21 +4075,21 @@ int main(int argc, char ** argv){
//SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());

// process files
std::vector<mtmd_bitmap> bitmaps;
mtmd::bitmaps bitmaps;
const bool has_mtmd = ctx_server.mctx != nullptr;
{
if (!has_mtmd && !files.empty()){
throw std::runtime_error("This server does not support multimodal");
}
for (auto & file : files){
mtmd_bitmap bmp;
int32_t res = mtmd_helper_bitmap_init_from_buf(file.data(), file.size(), bmp);
if (res != 0){
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(file.data(), file.size()));
if (!bmp.ptr){
throw std::runtime_error("Failed to load image");
}
// calculate bitmap hash (for KV caching)
bmp.id = fnv_hash(bmp.data.data(), bmp.data.size());
bitmaps.push_back(std::move(bmp));
std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
bmp.set_id(hash.c_str());
bitmaps.entries.push_back(std::move(bmp));
}
}

Expand All@@ -4098,13 +4100,19 @@ int main(int argc, char ** argv){

} else if (oaicompat && has_mtmd){
// multimodal
std::string prompt_str = prompt.get<std::string>();
mtmd_input_text inp_txt ={
prompt.get<std::string>(),
prompt_str.c_str(),
/* add_special */ true,
/* parse_special */ true,
};
mtmd_input_chunks chunks;
int32_t tokenized = mtmd_tokenize(ctx_server.mctx, chunks, inp_txt, bitmaps);
mtmd::input_chunks chunks(mtmd_input_chunks_init());
auto bitmaps_c_ptr = bitmaps.c_ptr();
int32_t tokenized = mtmd_tokenize(ctx_server.mctx,
chunks.ptr.get(),
&inp_txt,
bitmaps_c_ptr.data(),
bitmaps_c_ptr.size());
if (tokenized != 0){
throw std::runtime_error("Failed to tokenize prompt");
}
Expand Down
94 changes: 43 additions & 51 deletions tools/server/utils.hpp
Original file line numberDiff line numberDiff line change
Expand Up@@ -998,7 +998,7 @@ struct server_tokens{
private: // disallow accessing these members directly, risking out-of-sync

// map a **start** position in tokens to the image chunk
std::unordered_map<llama_pos, mtmd_input_chunk> map_pos_to_image;
std::unordered_map<llama_pos, mtmd::input_chunk_ptr> map_pos_to_image;

// list of tokens
// it can include LLAMA_TOKEN_NULL, which is used to indicate a token that is not a text token
Expand DownExpand Up@@ -1027,9 +1027,9 @@ struct server_tokens{
// Unchecked element access, mirroring std::vector::operator[].
// Note: the non-const overload returns by value (a copy), so it cannot be
// used to assign through — presumably intentional to keep tokens in sync
// with map_pos_to_image; TODO confirm.
// Fix: both definitions were missing the statement-terminating semicolon
// and would not compile.
llama_token operator[](size_t index){ return tokens[index]; }
const llama_token& operator[](size_t index) const{ return tokens[index]; }

server_tokens(mtmd_input_chunks & mtmd_chunks, bool has_mtmd) : has_mtmd(has_mtmd){
for (auto & c : mtmd_chunks){
push_back(std::move(c));
server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd) : has_mtmd(has_mtmd){
for (size_t i = 0; i < mtmd_chunks.size(); ++i){
push_back(mtmd_chunks[i]);
}
}

Expand All@@ -1054,7 +1054,7 @@ struct server_tokens{
return oss.str();
}

const mtmd_input_chunk & find_chunk(llama_pos pos) const{
const mtmd::input_chunk_ptr & find_chunk(llama_pos pos) const{
auto it = map_pos_to_image.find(pos);
if (it != map_pos_to_image.end()){
return it->second;
Expand All@@ -1070,35 +1070,31 @@ struct server_tokens{
tokens.emplace_back(tok);
}

void push_back(mtmd_input_chunk && chunk){
if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE){
// will create a copy of the chunk if it contains non-text data
void push_back(const mtmd_input_chunk * chunk){
auto type = mtmd_input_chunk_get_type(chunk);
if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE){
GGML_ASSERT(has_mtmd);
GGML_ASSERT(chunk.tokens_image != nullptr);
const int n_pos = mtmd_image_tokens_get_n_pos(chunk.tokens_image.get());
auto img_tokens = mtmd_input_chunk_get_tokens_image(chunk);
const int n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
llama_pos start_pos = tokens.size();
printf("start_pos = %d, n_pos = %d\n", start_pos, n_pos);
for (int i = 0; i < n_pos; ++i){
tokens.emplace_back(LLAMA_TOKEN_NULL);
}
// TODO: use mtmd_input_chunk_copy when the C API is ready
map_pos_to_image[start_pos] = std::move(chunk);
} else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT){
for (auto & tok : chunk.tokens_text){
push_back(tok);
mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
map_pos_to_image[start_pos] = std::move(new_chunk);
} else if (type == MTMD_INPUT_CHUNK_TYPE_TEXT){
size_t n_tokens;
auto text_tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
for (size_t i = 0; i < n_tokens; ++i){
push_back(text_tokens[i]);
}
} else{
GGML_ABORT("Invalid chunk type");
}
}

// TODO: use mtmd_input_chunk_copy when the C API is ready
void move_chunk(server_tokens & dst, llama_pos pos){
auto it = map_pos_to_image.find(pos);
if (it == map_pos_to_image.end()){
throw std::runtime_error("Chunk not found");
}
dst.push_back(std::move(it->second));
}

void insert(llama_tokens & tokens){
tokens.insert(tokens.end(), tokens.begin(), tokens.end());
}
Expand All@@ -1116,6 +1112,7 @@ struct server_tokens{
}

void resize(size_t n){
GGML_ASSERT(n <= tokens.size());
// we throw an error if we try to remove a token in the middle of an image
// for ex. with input of 5 text tokens and 2 images:
// [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
Expand DownExpand Up@@ -1164,12 +1161,16 @@ struct server_tokens{
GGML_ASSERT(has_mtmd);
const auto & a_chunk = find_chunk(i);
const auto & b_chunk = b.find_chunk(i);
std::string ai_id = mtmd_image_tokens_get_id(a_chunk.tokens_image.get());
std::string bi_id = mtmd_image_tokens_get_id(b_chunk.tokens_image.get());
if (ai_id == bi_id){
size_t n_pos = mtmd_image_tokens_get_n_pos(a_chunk.tokens_image.get());
GGML_ASSERT(n_pos > 0 && "Invalid image token"); // should never happen
i += n_pos - 1; // will be +1 by the for loop
GGML_ASSERT(a_chunk && b_chunk);
const auto * a_img = mtmd_input_chunk_get_tokens_image(a_chunk.get());
const auto * b_img = mtmd_input_chunk_get_tokens_image(b_chunk.get());
std::string ai_id = mtmd_image_tokens_get_id(a_img);
std::string bi_id = mtmd_image_tokens_get_id(b_img);
size_t a_pos = mtmd_image_tokens_get_n_pos(a_img);
size_t b_pos = mtmd_image_tokens_get_n_pos(b_img);
if (ai_id == bi_id && a_pos == b_pos){
GGML_ASSERT(a_pos > 0 && "Invalid image token"); // should never happen
i += a_pos - 1; // will be +1 by the for loop
continue;
} else{
return i;
Expand All@@ -1190,7 +1191,8 @@ struct server_tokens{
if (t == LLAMA_TOKEN_NULL){
try{
const auto & chunk = find_chunk(i);
size_t n_pos = mtmd_image_tokens_get_n_pos(chunk.tokens_image.get());
const auto * img_tokens = mtmd_input_chunk_get_tokens_image(chunk.get());
size_t n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
i += n_pos - 1; // will be +1 by the for loop
} catch (const std::exception & e){
return false;
Expand All@@ -1202,7 +1204,7 @@ struct server_tokens{
return true;
}

// TODO: (IMPORTANT) this is hacky ; use mtmd helper when C API is ready
// encode and decode the image chunk
int32_t process_chunk(
llama_context * ctx,
mtmd_context * mctx,
Expand All@@ -1213,34 +1215,24 @@ struct server_tokens{
if (it == map_pos_to_image.end()){
throw std::runtime_error("Chunk not found");
}
size_t n_pos = mtmd_image_tokens_get_n_pos(it->second.tokens_image.get());
mtmd_input_chunks chunks;
{
mtmd_input_chunk chunk0{
/* type */ MTMD_INPUT_CHUNK_TYPE_IMAGE,
/* tokens_text */{},
/* tokens_image */ std::move(it->second.tokens_image), // move it back later
};
mtmd_input_chunk chunk1{
/* type */ MTMD_INPUT_CHUNK_TYPE_TEXT,
/* tokens_text */{},
/* tokens_image */ nullptr,
};
chunks.emplace_back(std::move(chunk0));
chunks.emplace_back(std::move(chunk1));
}
SRV_INF("%s\n", "processing image...");
int32_t n_batch = llama_n_batch(ctx);
int64_t t0 = ggml_time_ms();
int32_t result = mtmd_helper_eval(mctx, ctx, chunks, n_past, seq_id, n_batch);
llama_pos new_n_past = n_past;
int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
it->second.get(), // chunk
n_past,
seq_id,
n_batch,
true, // logits last
&new_n_past);
SRV_INF("image processed in %" PRId64 " ms\n", ggml_time_ms() - t0);
it->second.tokens_image = std::move(chunks[0].tokens_image);
if (result != 0){
LOG_ERR("mtmd_helper_eval failed with status %d", result);
n_pos_out = 0;
n_pos_out = n_past;
return result;
}
n_pos_out = n_pos;
n_pos_out = new_n_past;
return 0;
}
};
Expand Down