[model] Add support for Plamo3 #17304
base: master
Conversation
…lama.cpp into features/suppert-plamo-3
CISC commented Nov 17, 2025
Any non-gated models available?
mmnga commented Nov 17, 2025
There are no non-gated models available at the moment.
mmnga commented Nov 17, 2025
Sorry, the checks failed, so I’m reverting it to draft for now.
CISC commented Nov 17, 2025
The |
mmnga commented Nov 18, 2025
I’ve reopened this PR. Thank you in advance.
mmnga commented Dec 1, 2025
When you have time, I’d appreciate a quick look or any feedback on this PR.
CISC left a comment
Sorry for the late review.
Make the changes, then rebase to resolve llama-arch.cpp changes.
```python
# PLaMo models use a custom tokenizer with a .jsonl file
tokenizer_jsonl_path = self.dir_model / "tokenizer.jsonl"
tokenizer_config_path = self.dir_model / "tokenizer_config.json"

if not tokenizer_jsonl_path.is_file():
    raise FileNotFoundError(f"PLaMo tokenizer file not found: {tokenizer_jsonl_path}")

# Load tokenizer config
with open(tokenizer_config_path, "r", encoding="utf-8") as f:
    tokenizer_config = json.load(f)

# Load tokens from JSONL file (actually a list format)
tokens = []
scores = []
toktypes = []

with open(tokenizer_jsonl_path, "r", encoding="utf-8") as f:
    for line_num, line in enumerate(f):
        if line.strip():
            token_data = json.loads(line)
            # Format: [token, score, type, ?, ?, ?, ?]
            token = token_data[0].encode("utf-8")
            score = float(token_data[1])
            token_type_str = token_data[2] if len(token_data) > 2 else "NORMAL"

            tokens.append(token)
            scores.append(score)

            if token_type_str == "UNKNOWN":
                toktypes.append(gguf.TokenType.UNKNOWN)
            elif token_type_str == "CONTROL":
                toktypes.append(gguf.TokenType.CONTROL)
            elif token_type_str == "BYTE":
                toktypes.append(gguf.TokenType.BYTE)
            else:
                token_str = token_data[0]
                if token_str.startswith("<|plamo:") and token_str.endswith("|>"):
                    toktypes.append(gguf.TokenType.CONTROL)
                else:
                    toktypes.append(gguf.TokenType.NORMAL)

vocab_size = self.hparams["vocab_size"]
if vocab_size > len(tokens):
    pad_count = vocab_size - len(tokens)
    logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
    for i in range(1, pad_count + 1):
        tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
        scores.append(-1000.0)
        toktypes.append(gguf.TokenType.UNUSED)

self.gguf_writer.add_tokenizer_model("plamo2")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)

if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] is not None:
    token_id = tokens.index(tokenizer_config["bos_token"].encode("utf-8"))
    self.gguf_writer.add_bos_token_id(token_id)
if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] is not None:
    token_id = tokens.index(tokenizer_config["eos_token"].encode("utf-8"))
    self.gguf_writer.add_eos_token_id(token_id)
if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] is not None:
    token_id = tokens.index(tokenizer_config["pad_token"].encode("utf-8"))
    self.gguf_writer.add_pad_token_id(token_id)
if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] is not None:
    token_id = tokens.index(tokenizer_config["sep_token"].encode("utf-8"))
    self.gguf_writer.add_sep_token_id(token_id)
if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] is not None:
    token_id = tokens.index(tokenizer_config["unk_token"].encode("utf-8"))
    self.gguf_writer.add_unk_token_id(token_id)

# Add <|plamo:op|> as EOT to ensure appropriate end of generation
self.gguf_writer.add_eot_token_id(4)
self.gguf_writer.add_add_space_prefix(False)

if "chat_template" in tokenizer_config and tokenizer_config["chat_template"] is not None:
    self.gguf_writer.add_chat_template(tokenizer_config["chat_template"])
```
Suggested change:

```python
self._set_vocab_plamo()

tokenizer_config_path = self.dir_model / "tokenizer_config.json"
chat_template = None  # ensure defined even if tokenizer_config.json is absent
if tokenizer_config_path.is_file():
    with open(tokenizer_config_path, encoding="utf-8") as f:
        tokenizer_config = json.load(f)
        chat_template = tokenizer_config.get("chat_template")

chat_template_jinja = self.dir_model / "chat_template.jinja"
if chat_template_jinja.is_file():
    with open(chat_template_jinja, encoding="utf-8") as f:
        chat_template = f.read()

if chat_template:
    self.gguf_writer.add_chat_template(chat_template)
```
Move the rest of the code into `TextModel._set_vocab_plamo` and update `Plamo2Model.set_vocab` to just call `self._set_vocab_plamo()`.
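For clarity, a minimal sketch of the layout being asked for (class names are the ones already used in convert_hf_to_gguf.py; method bodies are elided — this is an illustration, not the actual diff):

```python
# Sketch of the suggested refactor: the shared PLaMo tokenizer handling lives in
# TextModel, and both PLaMo converter classes delegate to it. Base classes and
# bodies are elided; not the final implementation.

class TextModel:
    def _set_vocab_plamo(self) -> None:
        # the tokenizer.jsonl / tokenizer_config.json handling from the block above
        ...

class Plamo2Model(TextModel):
    def set_vocab(self) -> None:
        self._set_vocab_plamo()

class Plamo3Model(TextModel):
    def set_vocab(self) -> None:
        self._set_vocab_plamo()
        # plus the chat_template.jinja handling from the suggestion above
```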
```python
hparams = self.hparams
block_count = hparams["num_hidden_layers"]

self.gguf_writer.add_vocab_size(hparams["vocab_size"])
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])

head_dim = hparams["head_dim"]
self.gguf_writer.add_key_length(head_dim)
self.gguf_writer.add_value_length(head_dim)
self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
self.gguf_writer.add_rope_freq_base(hparams["rope_theta"])

window_size = hparams.get("window_size") or hparams.get("sliding_window") or 0
self.gguf_writer.add_sliding_window(window_size)

pattern = self._sliding_window_pattern(block_count)
if len(pattern) == block_count and any(pattern):
    self.gguf_writer.add_sliding_window_pattern(pattern)

self.gguf_writer.add_file_type(self.ftype)
```
Suggested change:

```python
super().set_gguf_parameters()
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])

if (sliding_window := self.find_hparam(["window_size", "sliding_window"], optional=True)) is not None:
    self.gguf_writer.add_sliding_window(sliding_window)

pattern = self._sliding_window_pattern(self.block_count)
if len(pattern) == self.block_count and any(pattern):
    self.gguf_writer.add_sliding_window_pattern(pattern)
```
```python
results: list[tuple[str, Tensor]] = []

if "gate_up_proj.weight" in name:
    name_up = name.replace("gate_up_proj.weight", "up_proj.weight")
    name_gate = name.replace("gate_up_proj.weight", "gate_proj.weight")
    gate_proj_weight, up_proj_weight = torch.chunk(data_torch, 2, dim=0)
    results.append((self.map_tensor_name(name_gate), gate_proj_weight))
    results.append((self.map_tensor_name(name_up), up_proj_weight))
else:
    results.append((self.map_tensor_name(name), data_torch))

return results
```
Suggested change:

```python
return [(self.map_tensor_name(name), data_torch)]
```
```python
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_POST_NORM,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
```
Suggested change: drop the `MODEL_TENSOR.FFN_GATE,` entry.
```cpp
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
{ LLM_TENSOR_FFN_NORM,       "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_POST_NORM,  "blk.%d.post_ffw_norm" },
{ LLM_TENSOR_FFN_GATE,       "blk.%d.ffn_gate" },
```
Suggested change: drop the `{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },` entry.
Take care to update this accordingly after you rebase.
```cpp
if (hparams.is_swa_any()) {
    inp_attn_iswa = build_attn_inp_kv_iswa();
} else {
    inp_attn = build_attn_inp_kv();
}
```
Make `build_plamo3` templated to handle this, as is done for e.g. smallthinker.
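For reference, the smallthinker-style pattern looks roughly like the sketch below. It is only an illustration: the `llm_graph_input_attn_kv` / `llm_graph_input_attn_kv_iswa` type names are assumed to mirror `build_attn_inp_kv()` / `build_attn_inp_kv_iswa()`, and the rest of the graph is elided.

```cpp
// Sketch of a templated builder (smallthinker-style); not the final code.
// The inp_attn_type names are assumed to mirror build_attn_inp_kv()/_iswa().
template <bool iswa>
struct llm_build_plamo3 : public llm_graph_context {
    llm_build_plamo3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
        using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;

        inp_attn_type * inp_attn = nullptr;
        if constexpr (iswa) {
            inp_attn = build_attn_inp_kv_iswa();
        } else {
            inp_attn = build_attn_inp_kv();
        }

        // ... build the PLaMo-3 layers here, passing inp_attn to build_attn() ...
        GGML_UNUSED(inp_attn);
    }
};

// At the construction site the instantiation is then picked from hparams, e.g.:
//
//   if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
//       llm = std::make_unique<llm_build_plamo3<true>> (model, params);
//   } else {
//       llm = std::make_unique<llm_build_plamo3<false>>(model, params);
//   }
```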
```cpp
const int64_t head_dim_q = hparams.n_embd_head_k;
const int64_t head_dim_v = hparams.n_embd_head_v;

ggml_tensor * inpL = build_inp_embd(model.tok_embd);
```
Suggested change:

```cpp
ggml_tensor * cur;
ggml_tensor * inpL = build_inp_embd(model.tok_embd);
```
```cpp
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

ggml_tensor * cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
```
Suggested change:

```cpp
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
```
```cpp
    inpL = cur;
}

ggml_tensor * cur = inpL;
```
Suggested change:

```cpp
cur = inpL;
```
```cpp
ggml_tensor * ffn_up = build_lora_mm(model.layers[il].ffn_up, cur);
cb(ffn_up, "ffn_up", il);

ggml_tensor * ffn_gate = build_lora_mm(model.layers[il].ffn_gate, cur);
cb(ffn_gate, "ffn_gate", il);

ggml_tensor * ffn_act = ggml_swiglu_split(ctx0, ffn_gate, ffn_up);
cb(ffn_act, "ffn_act", il);

cur = build_lora_mm(model.layers[il].ffn_down, ffn_act);
cb(cur, "ffn_down", il);
```
Suggested change:

```cpp
cur = build_ffn(cur,
        model.layers[il].ffn_up,   NULL, NULL,
        NULL,                      NULL, NULL,
        model.layers[il].ffn_down, NULL, NULL,
        NULL,
        LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
cb(cur, "ffn_out", il);
```
```cpp
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;

ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
if (hparams.n_swa == 0) {
    hparams.n_swa = 2048;
}
hparams.set_swa_pattern(8);
```
Suggested change:

```cpp
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);

if (found_swa && hparams.n_swa > 0) {
    hparams.swa_type                  = LLAMA_SWA_TYPE_STANDARD;
    hparams.rope_freq_base_train_swa  = 10000.0f;
    hparams.rope_freq_scale_train_swa = 1.0f;
    hparams.set_swa_pattern(8);
} else {
    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
}
```
Not sure where you were going with this, but since you seem to handle non-swa I'm guessing something like this?
This PR adds support for the PLaMo-3 series (2B, 8B, 31B base models).
PLaMo-3 uses a hybrid architecture with Sliding Window Attention (SWA) and standard full attention layers, as well as a custom FFN layout. This PR wires those pieces into llama.cpp so that the official checkpoints can be converted to GGUF and run with the usual backends.
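After conversion, one way to double-check that the new metadata landed in the GGUF is to list the relevant keys with gguf-py. This is only a hedged sketch: the file name is a placeholder and the exact key prefix written by the converter (e.g. `plamo3.`) is an assumption, not something confirmed here.

```python
# Hypothetical sanity check on a converted PLaMo-3 GGUF using llama.cpp's gguf-py.
# "plamo-3-2b.gguf" is a placeholder path; adjust the filters as needed.
from gguf import GGUFReader

reader = GGUFReader("plamo-3-2b.gguf")

# Print the metadata keys the converter is expected to have written,
# e.g. the sliding-window settings and the tokenizer model.
for key in sorted(reader.fields):
    if "sliding_window" in key or key.startswith("tokenizer.ggml.model"):
        print(key)
```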