Merged
Commits (79)
466c6cd
server : (experimental) vision support via libmtmd
ngxson Apr 11, 2025
2317e61
mtmd : add more api around mtmd_image_tokens
ngxson Apr 11, 2025
a46b6db
mtmd : add more api around mtmd_image_tokens
ngxson Apr 11, 2025
7ac0b7b
mtmd : ability to calc image hash
ngxson Apr 11, 2025
58c4767
shared_ptr for mtmd_image_tokens
ngxson Apr 12, 2025
d3c3e20
move hash to user-define ID (fixed)
ngxson Apr 12, 2025
a44029a
Merge branch 'xsn/mtmd_image_api' into xsn/server_mtmd
ngxson Apr 13, 2025
5e6c7ba
abstract out the batch management
ngxson Apr 13, 2025
78a76de
Merge branch 'master' into xsn/server_mtmd
ngxson Apr 14, 2025
c734b53
Merge branch 'master' into xsn/server_mtmd
ngxson Apr 21, 2025
a6a3653
small fix
ngxson Apr 21, 2025
f8bc466
refactor logic adding tokens to batch
ngxson Apr 21, 2025
f5420e1
implement hashing image
ngxson Apr 21, 2025
aae2e69
Merge branch 'master' into xsn/server_mtmd
ngxson Apr 23, 2025
cd11585
use FNV hash, now hash bitmap instead of file data
ngxson Apr 23, 2025
8afa952
allow decoding image embedding to be split into batches
ngxson Apr 23, 2025
989730c
rm whitespace
ngxson Apr 23, 2025
19b9fe1
Merge branch 'master' into xsn/server_mtmd
ngxson Apr 24, 2025
2df8c1a
disable some features when mtmd is on
ngxson Apr 24, 2025
b9ef895
fix --no-mmproj-offload
ngxson Apr 25, 2025
add9e21
mtmd_context_params no timings
ngxson Apr 25, 2025
0f39770
Merge branch 'master' into xsn/server_mtmd
ngxson Apr 25, 2025
58100b3
refactor server_inp to server_tokens
ngxson Apr 25, 2025
e82fea8
fix the failing test case
ngxson Apr 25, 2025
4a4f35c
init
ngxson Apr 29, 2025
f6b6517
wip
ngxson Apr 29, 2025
e0806c2
Merge branch 'master' into xsn/mtmd_c_api
ngxson Apr 29, 2025
82f4246
working version
ngxson Apr 29, 2025
f8c27b9
add mtmd::bitmaps
ngxson Apr 29, 2025
3357961
add test target
ngxson Apr 29, 2025
92d2404
rm redundant define
ngxson Apr 29, 2025
111d5af
test: mtmd_input_chunks_free
ngxson Apr 29, 2025
08d0f9c
rm outdated comment
ngxson Apr 29, 2025
a230804
Merge branch 'master' into xsn/mtmd_c_api
ngxson May 2, 2025
863db31
fix merging issue
ngxson May 2, 2025
a0fb701
explicitly create mtmd::input_chunks
ngxson May 2, 2025
6bc7a30
mtmd_input_chunk_copy
ngxson May 2, 2025
4d842eb
add clone()
ngxson May 2, 2025
f91fb97
Merge branch 'master' into xsn/server_mtmd
ngxson May 3, 2025
2cedd18
improve server_input struct
ngxson May 3, 2025
3ee071c
clip : fix confused naming ffn_up and ffn_down
ngxson May 3, 2025
3fbf0bd
rm ffn_i/o/g naming
ngxson May 3, 2025
f3870a6
rename n_embd, n_ff
ngxson May 3, 2025
ae83229
small fix
ngxson May 3, 2025
0009f76
Merge branch 'master' into xsn/clip_ffn_up_down_fix
ngxson May 3, 2025
246a4e0
no check n_ff
ngxson May 3, 2025
57b288f
Merge branch 'xsn/clip_ffn_up_down_fix' into xsn/server_mtmd
ngxson May 3, 2025
5f1fe1b
fix detokenize
ngxson May 3, 2025
06cb595
Merge branch 'master' into xsn/mtmd_c_api
ngxson May 4, 2025
e9f7ff9
add const to various places
ngxson May 4, 2025
049ae24
add warning about breaking changes
ngxson May 4, 2025
91613c0
Merge branch 'xsn/mtmd_c_api' into xsn/server_mtmd
ngxson May 4, 2025
d3fece5
add c api
ngxson May 4, 2025
076e3b9
helper: use mtmd_image_tokens_get_n_pos
ngxson May 4, 2025
574d403
Merge branch 'xsn/mtmd_c_api' into xsn/server_mtmd
ngxson May 4, 2025
036f682
Merge branch 'master' into xsn/server_mtmd
ngxson May 4, 2025
01c623e
fix ctx_shift
ngxson May 4, 2025
a0f2562
fix name shadowing
ngxson May 4, 2025
9149f39
Merge branch 'master' into xsn/server_mtmd
ngxson May 5, 2025
b353038
Merge branch 'master' into xsn/server_mtmd
ngxson May 6, 2025
3304b44
more strict condition
ngxson May 6, 2025
88461f2
support remote image_url
ngxson May 6, 2025
4adce86
Merge branch 'master' into xsn/server_mtmd
ngxson May 6, 2025
a9b21f4
remote image_url log
ngxson May 6, 2025
2f30530
add CI test
ngxson May 6, 2025
5ffde38
do not log base64
ngxson May 6, 2025
aaebc33
add "has_multimodal" to /props
ngxson May 8, 2025
eeda075
remove dangling image
ngxson May 8, 2025
bef122e
speculative: use slot.cache_tokens.insert
ngxson May 8, 2025
7282456
Merge branch 'master' into xsn/server_mtmd
ngxson May 8, 2025
51afc0a
Apply suggestions from code review
ngxson May 9, 2025
f10fc56
rm can_be_detokenized
ngxson May 9, 2025
689035c
on prmpt processing done, assert cache_tokens.size
ngxson May 9, 2025
b2906a9
handle_completions_impl returns void
ngxson May 9, 2025
abfd821
Merge branch 'master' into xsn/server_mtmd
ngxson May 9, 2025
f5fbc03
adapt the new web ui
ngxson May 9, 2025
5fe8d72
update docs and hot topics
ngxson May 9, 2025
b8000fd
rm assert
ngxson May 9, 2025
9ed430c
small fix (2)
ngxson May 9, 2025
refactor server_inp to server_tokens
ngxson committed Apr 25, 2025
commit 58100b393d8c288c6f06fb6385d7a1127f1fc753
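This commit renames server_inputs to server_tokens and replaces direct access to its .chunks vector with a token-position interface. For orientation, below is a minimal sketch of the interface the call sites in this diff imply; the element type name server_token_entry is hypothetical, the signatures and return types are assumptions inferred from usage, and the real declaration lives elsewhere in the PR and may differ.

// Sketch only: reconstructed from call sites in the diff below, not copied from the PR.
// Depends on llama.h / mtmd.h types (llama_token, llama_tokens, mtmd_image_tokens, mtmd_input_chunks).
struct server_token_entry {                      // hypothetical name for the element type
    llama_token txt = LLAMA_TOKEN_NULL;          // text token, or LLAMA_TOKEN_NULL at an image position
    std::shared_ptr<mtmd_image_tokens> img;      // image tokens at this position (smart pointer assumed from .get())
};

struct server_tokens {
    bool has_mtmd = false;                       // set when the slot has a multimodal context (mctx != nullptr)

    server_tokens() = default;
    server_tokens(mtmd_input_chunks & chunks, bool has_mtmd); // build from tokenized text + image chunks

    size_t size() const;                         // number of token positions, text and image combined
    bool   empty() const;
    void   clear();
    void   resize(size_t n);
    void   keep_until(size_t n);                 // drop every position >= n, used after prefix matching

    server_token_entry       & operator[](size_t i);
    const server_token_entry & operator[](size_t i) const;

    void add_text_token(llama_token tok);
    void add_token(server_token_entry && entry);
    void set_text_tokens(const llama_tokens & toks);

    size_t get_common_prefix(const server_tokens & other) const;

    // stable id for an image bitmap, used for KV-cache reuse (return type assumed)
    static std::string fnv_hash(const uint8_t * data, size_t n_bytes);
};

Keeping text tokens and image chunks behind one container lets the existing prompt-cache logic (common-prefix matching, cache reuse, context shift) treat multimodal prompts the same way as plain text, which is what most of the mechanical changes below do.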
139 changes: 74 additions & 65 deletions examples/server/server.cpp
@@ -198,7 +198,7 @@ struct server_task {

// used by SERVER_TASK_TYPE_INFERENCE
slot_params params;
server_inputs prompt_tokens;
server_tokens prompt_tokens;
int id_selected_slot = -1;

// used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE
@@ -1277,14 +1277,14 @@ struct server_slot {
int32_t n_prompt_tokens_processed = 0;

// input prompt tokens
server_inputs prompt_tokens;
server_tokens prompt_tokens;

size_t last_nl_pos = 0;

std::string generated_text;
llama_tokens generated_tokens;

server_inputs cache_tokens;
server_tokens cache_tokens;

std::vector<completion_token_output> generated_token_probs;

@@ -2020,6 +2020,7 @@ struct server_context {
slot.n_ctx = n_ctx_slot;
slot.n_predict = params_base.n_predict;
slot.mctx = mctx;
slot.cache_tokens.has_mtmd = mctx != nullptr;

if (model_dft){
slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1);
@@ -2096,7 +2097,7 @@ struct server_context {
int cur_lcs_len = slot.cache_tokens.get_common_prefix(task.prompt_tokens);

// fraction of the common subsequence length compared to the current slot's prompt length
float cur_similarity = static_cast<float>(cur_lcs_len) / static_cast<int>(slot.cache_tokens.n_tokens());
float cur_similarity = static_cast<float>(cur_lcs_len) / static_cast<int>(slot.cache_tokens.size());

// select the current slot if the criteria match
if (cur_lcs_len > lcs_len && cur_similarity > slot_prompt_similarity){
@@ -2135,7 +2136,7 @@ struct server_context {
return ret;
}

bool can_be_detokenized(const struct llama_context * ctx, const server_inputs & inp){
bool can_be_detokenized(const struct llama_context * ctx, const server_tokens & inp){
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
const int32_t n_vocab = llama_vocab_n_tokens(vocab);
@@ -2786,7 +2787,7 @@ struct server_context {
break;
}

const size_t token_count = slot->cache_tokens.n_tokens();
const size_t token_count = slot->cache_tokens.size();
const int64_t t_start = ggml_time_us();

std::string filename = task.slot_action.filename;
@@ -2877,7 +2878,7 @@ struct server_context {
}

// Erase token cache
const size_t n_erased = slot->cache_tokens.n_tokens();
const size_t n_erased = slot->cache_tokens.size();
llama_kv_self_seq_rm(ctx, slot->id, -1, -1);
slot->cache_tokens.clear();

@@ -2957,11 +2958,11 @@ struct server_context {
llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);

if (slot.params.cache_prompt){
for (size_t i = n_keep + n_discard; i < slot.cache_tokens.chunks.size(); i++){
slot.cache_tokens.chunks[i - n_discard] = std::move(slot.cache_tokens.chunks[i]);
for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++){
slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
}

slot.cache_tokens.chunks.resize(slot.cache_tokens.chunks.size() - n_discard);
slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
}

slot.n_past -= n_discard;
@@ -3004,7 +3005,7 @@ struct server_context {
}

SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_cache_tokens = %d, truncated = %d\n",
slot.n_ctx, slot.n_past, (int) slot.cache_tokens.n_tokens(), slot.truncated);
slot.n_ctx, slot.n_past, (int) slot.cache_tokens.size(), slot.truncated);
}

// process in chunks of params.n_batch
@@ -3033,23 +3034,23 @@ struct server_context {
slot.t_start_generation = 0;

slot.n_past = 0;
slot.n_prompt_tokens = prompt_tokens.n_tokens();
slot.n_prompt_tokens = prompt_tokens.size();
slot.state = SLOT_STATE_PROCESSING_PROMPT;

SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);

// print prompt tokens (for debugging)
// if (1){
// // first 16 tokens (avoid flooding logs)
// for (int i = 0; i < std::min<int>(16, prompt_tokens.size()); i++){
// SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
// }
// } else{
// // all
// for (int i = 0; i < (int) prompt_tokens.size(); i++){
// SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
// }
// }
/*if (1){
// first 16 tokens (avoid flooding logs)
for (int i = 0; i < std::min<int>(16, prompt_tokens.size()); i++){
SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
}
} else{
// all
for (int i = 0; i < (int) prompt_tokens.size(); i++){
SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
}
}*/

// empty prompt passed -> release the slot and send empty response
if (prompt_tokens.empty()){
@@ -3113,7 +3114,7 @@ struct server_context {
prompt_tokens.set_text_tokens(new_tokens);

slot.truncated = true;
slot.n_prompt_tokens = prompt_tokens.n_tokens();
slot.n_prompt_tokens = prompt_tokens.size();

SLT_WRN(slot, "input truncated, n_ctx = %d, n_keep = %d, n_left = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, n_left, slot.n_prompt_tokens);

@@ -3136,13 +3137,13 @@ struct server_context {

SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params_base.n_cache_reuse, slot.n_past);

while (head_c < slot.cache_tokens.chunks.size() &&
head_p < prompt_tokens.chunks.size()){
while (head_c < slot.cache_tokens.size() &&
head_p < prompt_tokens.size()){

size_t n_match = 0;
while (head_c + n_match < slot.cache_tokens.chunks.size() &&
head_p + n_match < prompt_tokens.chunks.size() &&
slot.cache_tokens.chunks[head_c + n_match].tok_text == prompt_tokens.chunks[head_p + n_match].tok_text){
while (head_c + n_match < slot.cache_tokens.size() &&
head_p + n_match < prompt_tokens.size() &&
slot.cache_tokens[head_c + n_match].txt == prompt_tokens[head_p + n_match].txt){

n_match++;
}
@@ -3159,7 +3160,7 @@ struct server_context {
llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);

for (size_t i = 0; i < n_match; i++){
slot.cache_tokens.chunks[head_p + i].tok_text = slot.cache_tokens.chunks[head_c + i].tok_text;
slot.cache_tokens[head_p + i].txt = slot.cache_tokens[head_c + i].txt;
slot.n_past++;
}

@@ -3207,12 +3208,13 @@ struct server_context {
// remove the non-common part from the cache
slot.cache_tokens.keep_until(slot.n_past);

auto & curr_chunk = slot.prompt_tokens.get_chunk(slot.n_past);
auto & cur_tok = slot.prompt_tokens[slot.n_past];

// check if we should process the image
if (curr_chunk.tok_image){
if (cur_tok.img){
// process the image
int32_t res = server_img_process(ctx, mctx, curr_chunk, batch_embd, slot.n_past, slot.id);
int32_t res = server_img_process(ctx, mctx, cur_tok, batch_embd, slot.n_past, slot.id);
int32_t n_tokens = mtmd_image_tokens_get_n_tokens(cur_tok.img.get());
if (res != 0){
SLT_ERR(slot, "failed to process image, res = %d\n", res);
slot.release();
@@ -3221,27 +3223,30 @@ struct server_context {
}

if (slot.params.cache_prompt){
slot.cache_tokens.add_image_tokens(curr_chunk.tok_image);
// add ALL image tokens at once
for (int32_t i = 0; i < n_tokens; i++){
slot.cache_tokens.add_token(std::move(slot.prompt_tokens[slot.n_past + i]));
}
}

slot.n_past += curr_chunk.n_tokens;
slot.n_prompt_tokens_processed += curr_chunk.n_tokens;
slot.n_past += n_tokens;
slot.n_prompt_tokens_processed += n_tokens;
}

// add prompt tokens for processing in the current batch
while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch){
// get next token to process
auto & curr_chunk = slot.prompt_tokens.get_chunk(slot.n_past);
if (curr_chunk.tok_text == LLAMA_TOKEN_NULL){
auto & curr_chunk = slot.prompt_tokens[slot.n_past];
if (curr_chunk.txt == LLAMA_TOKEN_NULL){
break; // end of text chunk
}

// without pooling, we want to output the embeddings for all the tokens in the batch
const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE;

common_batch_add(batch, curr_chunk.tok_text, slot.n_past, { slot.id }, need_embd);
common_batch_add(batch, curr_chunk.txt, slot.n_past, { slot.id }, need_embd);
if (slot.params.cache_prompt){
slot.cache_tokens.add_text_token(curr_chunk.tok_text);
slot.cache_tokens.add_text_token(curr_chunk.txt);
}

slot.n_prompt_tokens_processed++;
@@ -3261,10 +3266,10 @@ struct server_context {
common_sampler_reset(slot.smpl);

// Process all prompt tokens through sampler system
for (size_t i = 0; i < slot.cache_tokens.n_tokens(); ++i){
auto & curr_chunk = slot.prompt_tokens.get_chunk(i);
if (curr_chunk.tok_text != LLAMA_TOKEN_NULL){
common_sampler_accept(slot.smpl, curr_chunk.tok_text, false);
for (size_t i = 0; i < slot.cache_tokens.size(); ++i){
auto & cur_tok = slot.prompt_tokens[i];
if (cur_tok.txt != LLAMA_TOKEN_NULL){
common_sampler_accept(slot.smpl, cur_tok.txt, false);
}
}

@@ -3289,7 +3294,6 @@ struct server_context {
return;
}

// debug
SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens);

if (slot_batched){
@@ -3303,7 +3307,7 @@ struct server_context {
for (int32_t i = 0; i < batch.n_tokens; i += n_batch){
const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);

llama_batch batch_view = llama_batch{
llama_batch batch_view = {
n_tokens,
batch.token + i,
nullptr,
@@ -4072,38 +4076,43 @@ int main(int argc, char ** argv) {

// process files
std::vector<mtmd_bitmap> bitmaps;
const bool has_mtmd = ctx_server.mctx != nullptr;
{
if (!has_mtmd && !files.empty()){
throw std::runtime_error("This server does not support multimodal");
}
for (auto & file : files){
mtmd_bitmap bmp;
int32_t res = mtmd_helper_bitmap_init_from_buf(file.data(), file.size(), bmp);
if (res != 0){
throw std::runtime_error("Failed to load image");
}
// calculate bitmap hash (for KV caching)
bmp.id = server_inputs::fnv_hash(bmp.data.data(), bmp.data.size());
bmp.id = server_tokens::fnv_hash(bmp.data.data(), bmp.data.size());
bitmaps.push_back(std::move(bmp));
}
}

std::vector<server_inputs> inputs;
if (oaicompat){
if (!prompt.is_string()){
throw std::runtime_error("prompt must be a string");
} else{
// SRV_INF("prompt: %s\n", prompt.get<std::string>().c_str());
mtmd_input_text inp_txt = {
prompt.get<std::string>(),
/* add_special */ true,
/* parse_special */ true,
};
mtmd_input_chunks chunks;
int32_t tokenized = mtmd_tokenize(ctx_server.mctx, chunks, inp_txt, bitmaps);
if (tokenized != 0){
throw std::runtime_error("Failed to tokenize prompt");
}
server_inputs tmp(chunks);
inputs.push_back(std::move(tmp));
// process prompt
std::vector<server_tokens> inputs;
if (oaicompat && !prompt.is_string()){
throw std::runtime_error("prompt must be a string");

} else if (oaicompat && has_mtmd){
// multimodal
mtmd_input_text inp_txt = {
prompt.get<std::string>(),
/* add_special */ true,
/* parse_special */ true,
};
mtmd_input_chunks chunks;
int32_t tokenized = mtmd_tokenize(ctx_server.mctx, chunks, inp_txt, bitmaps);
if (tokenized != 0){
throw std::runtime_error("Failed to tokenize prompt");
}
server_tokens tmp(chunks, true);
inputs.push_back(std::move(tmp));

} else{
// non-multimodal version
auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
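The bmp.id computed above via server_tokens::fnv_hash feeds the prompt cache: if the bitmap at a given position hashes to the same id across requests, the previously decoded image embeddings in the KV cache can be reused. Per the commit history ("use FNV hash, now hash bitmap instead of file data"), the hash is FNV over the decoded bitmap bytes rather than the encoded file. A minimal 64-bit FNV-1a sketch under that assumption; the exact variant and return type used in the PR may differ:

#include <cstdint>
#include <cstddef>

// Minimal 64-bit FNV-1a sketch; standard offset basis and prime assumed.
static uint64_t fnv_hash(const uint8_t * data, size_t len) {
    uint64_t hash = 0xcbf29ce484222325ULL;   // FNV offset basis
    for (size_t i = 0; i < len; ++i) {
        hash ^= data[i];                     // xor first, then multiply: the FNV-1a order
        hash *= 0x100000001b3ULL;            // FNV prime
    }
    return hash;
}

Hashing the decoded pixels rather than the uploaded file keeps the id tied to what the model actually sees, so the same bitmap yields the same id regardless of how the request delivered it.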