
Conversation

@mmnga
Contributor

This PR adds support for the PLaMo-3 series (2B, 8B, 31B base models):

PLaMo-3 uses a hybrid architecture with Sliding Window Attention (SWA) and standard full attention layers, as well as a custom FFN layout. This PR wires those pieces into llama.cpp so that the official checkpoints can be converted to GGUF and run with the usual backends.
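For intuition, this hybrid layout is conveyed to llama.cpp as a window size plus a per-layer flag saying whether a given layer uses the sliding window or full attention, which the converter writes as GGUF metadata. A minimal sketch of such a per-layer pattern, assuming a fixed interleave where every N-th layer is a full-attention layer (the helper name and the period here are illustrative, not the actual PLaMo-3 configuration):

```python
# Illustrative only: derive a per-layer SWA flag list for a hybrid model in
# which every `period`-th layer falls back to standard full attention.
def sliding_window_pattern(block_count: int, period: int = 8) -> list[bool]:
    # True  -> layer uses sliding-window attention
    # False -> layer uses standard full (global) attention
    return [(il + 1) % period != 0 for il in range(block_count)]

print(sliding_window_pattern(16))
# 7x True, False, 7x True, False  -> layers 8 and 16 attend globally
```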

@github-actions bot added the model (Model specific) and python (python script changes) labels Nov 16, 2025
@mmnga closed this Nov 16, 2025
@mmnga reopened this Nov 16, 2025
@mmnga marked this pull request as ready for review November 17, 2025 09:49
@CISC
Collaborator

CISC commented Nov 17, 2025

Any non-gated models available?

@mmnga
Contributor Author

There are no non-gated models available at the moment.

@mmnga
Contributor Author

Sorry, the checks failed, so I’m reverting it to draft for now.

@mmnga marked this pull request as draft November 17, 2025 13:34
@CISC
Collaborator

CISC commented Nov 17, 2025

> Sorry, the checks failed, so I’m reverting it to draft for now.

The nvidia-vulkan-cm CI failures are unrelated if that's what you're referring to...

@mmnga marked this pull request as ready for review November 18, 2025 15:32
@mmnga
Contributor Author

I’ve reopened this PR. Thank you in advance.

@mmnga
Contributor Author

mmnga commented Dec 1, 2025

When you have time, I’d appreciate a quick look or any feedback on this PR.
Happy to update the code if needed. Thanks!

@CISC
Collaborator

CISC left a comment


Sorry for the late review.

Make the changes, then rebase to resolve llama-arch.cpp changes.

Comment on lines +4982 to +5060
# PLaMo models use a custom tokenizer with a .jsonl file
tokenizer_jsonl_path = self.dir_model / "tokenizer.jsonl"
tokenizer_config_path = self.dir_model / "tokenizer_config.json"

if not tokenizer_jsonl_path.is_file():
    raise FileNotFoundError(f"PLaMo tokenizer file not found: {tokenizer_jsonl_path}")

# Load tokenizer config
with open(tokenizer_config_path, "r", encoding="utf-8") as f:
    tokenizer_config = json.load(f)

# Load tokens from JSONL file (actually a list format)
tokens = []
scores = []
toktypes = []

with open(tokenizer_jsonl_path, "r", encoding="utf-8") as f:
    for line_num, line in enumerate(f):
        if line.strip():
            token_data = json.loads(line)
            # Format: [token, score, type, ?, ?, ?, ?]
            token = token_data[0].encode("utf-8")
            score = float(token_data[1])
            token_type_str = token_data[2] if len(token_data) > 2 else "NORMAL"

            tokens.append(token)
            scores.append(score)

            if token_type_str == "UNKNOWN":
                toktypes.append(gguf.TokenType.UNKNOWN)
            elif token_type_str == "CONTROL":
                toktypes.append(gguf.TokenType.CONTROL)
            elif token_type_str == "BYTE":
                toktypes.append(gguf.TokenType.BYTE)
            else:
                token_str = token_data[0]
                if token_str.startswith("<|plamo:") and token_str.endswith("|>"):
                    toktypes.append(gguf.TokenType.CONTROL)
                else:
                    toktypes.append(gguf.TokenType.NORMAL)

vocab_size = self.hparams["vocab_size"]
if vocab_size > len(tokens):
    pad_count = vocab_size - len(tokens)
    logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
    for i in range(1, pad_count + 1):
        tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
        scores.append(-1000.0)
        toktypes.append(gguf.TokenType.UNUSED)

self.gguf_writer.add_tokenizer_model("plamo2")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)

if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] is not None:
    token_id = tokens.index(tokenizer_config["bos_token"].encode("utf-8"))
    self.gguf_writer.add_bos_token_id(token_id)
if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] is not None:
    token_id = tokens.index(tokenizer_config["eos_token"].encode("utf-8"))
    self.gguf_writer.add_eos_token_id(token_id)
if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] is not None:
    token_id = tokens.index(tokenizer_config["pad_token"].encode("utf-8"))
    self.gguf_writer.add_pad_token_id(token_id)
if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] is not None:
    token_id = tokens.index(tokenizer_config["sep_token"].encode("utf-8"))
    self.gguf_writer.add_sep_token_id(token_id)
if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] is not None:
    token_id = tokens.index(tokenizer_config["unk_token"].encode("utf-8"))
    self.gguf_writer.add_unk_token_id(token_id)

# Add <|plamo:op|> as EOT to ensure appropriate end of generation
self.gguf_writer.add_eot_token_id(4)

self.gguf_writer.add_add_space_prefix(False)

if "chat_template" in tokenizer_config and tokenizer_config["chat_template"] is not None:
    self.gguf_writer.add_chat_template(tokenizer_config["chat_template"])
Collaborator


Suggested change (replace the block above with):
self._set_vocab_plamo()

tokenizer_config_path = self.dir_model / "tokenizer_config.json"
if tokenizer_config_path.is_file():
    with open(tokenizer_config_path, encoding="utf-8") as f:
        tokenizer_config = json.load(f)
        chat_template = tokenizer_config.get("chat_template")

chat_template_jinja = self.dir_model / "chat_template.jinja"
if chat_template_jinja.is_file():
    with open(chat_template_jinja, encoding="utf-8") as f:
        chat_template = f.read()

if chat_template:
    self.gguf_writer.add_chat_template(chat_template)

Move the rest of the code into `TextModel._set_vocab_plamo` and update `Plamo2Model.set_vocab` to just call `self._set_vocab_plamo()`.
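A minimal sketch of the proposed split, assuming the helper keeps the name used in this comment (the helper body would simply be the tokenizer-parsing block quoted above, moved verbatim into the base class):

```python
# Sketch only: shared helper on the base class, thin overrides in the subclasses.
class TextModel:
    def _set_vocab_plamo(self):
        ...  # the tokenizer.jsonl parsing + gguf_writer calls shown above

class Plamo2Model(TextModel):
    def set_vocab(self):
        self._set_vocab_plamo()

class Plamo3Model(TextModel):
    def set_vocab(self):
        self._set_vocab_plamo()
        # plus the chat_template / chat_template.jinja handling from the suggestion above
```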

Comment on lines +5074 to +5097
hparams = self.hparams
block_count = hparams["num_hidden_layers"]

self.gguf_writer.add_vocab_size(hparams["vocab_size"])
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
head_dim = hparams["head_dim"]
self.gguf_writer.add_key_length(head_dim)
self.gguf_writer.add_value_length(head_dim)
self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
self.gguf_writer.add_rope_freq_base(hparams["rope_theta"])

window_size = hparams.get("window_size") or hparams.get("sliding_window") or 0
self.gguf_writer.add_sliding_window(window_size)

pattern = self._sliding_window_pattern(block_count)
if len(pattern) == block_count and any(pattern):
    self.gguf_writer.add_sliding_window_pattern(pattern)

self.gguf_writer.add_file_type(self.ftype)
@CISC
Collaborator

CISC commented Dec 17, 2025


Suggested change (replace the block above with):
super().set_gguf_parameters()
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])

if (sliding_window := self.find_hparam(["window_size", "sliding_window"], optional=True)) is not None:
    self.gguf_writer.add_sliding_window(sliding_window)

    pattern = self._sliding_window_pattern(self.block_count)
    if len(pattern) == self.block_count and any(pattern):
        self.gguf_writer.add_sliding_window_pattern(pattern)

Comment on lines +5114 to +5125
results: list[tuple[str, Tensor]] = []

if "gate_up_proj.weight" in name:
    name_up = name.replace("gate_up_proj.weight", "up_proj.weight")
    name_gate = name.replace("gate_up_proj.weight", "gate_proj.weight")
    gate_proj_weight, up_proj_weight = torch.chunk(data_torch, 2, dim=0)
    results.append((self.map_tensor_name(name_gate), gate_proj_weight))
    results.append((self.map_tensor_name(name_up), up_proj_weight))
else:
    results.append((self.map_tensor_name(name), data_torch))

return results
Collaborator


Suggested change (replace the block above with):
return [(self.map_tensor_name(name), data_torch)]

MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_POST_NORM,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
Collaborator


Suggested change (remove this line):
MODEL_TENSOR.FFN_GATE,

{LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
{LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
{LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
Collaborator


Suggested change (remove this line):
{LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },

Take care to update this accordingly after you rebase.

Comment on lines +14 to +18
if (hparams.is_swa_any()) {
    inp_attn_iswa = build_attn_inp_kv_iswa();
} else {
    inp_attn = build_attn_inp_kv();
}
Collaborator


Make build_plamo3 templated to handle this, like f.ex. smallthinker.

const int64_t head_dim_q = hparams.n_embd_head_k;
const int64_t head_dim_v = hparams.n_embd_head_v;

ggml_tensor * inpL = build_inp_embd(model.tok_embd);
Collaborator


Suggested change (replace the last line above with):
ggml_tensor * cur;
ggml_tensor * inpL = build_inp_embd(model.tok_embd);

const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

ggml_tensor * cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
Collaborator


Suggested change (replace the last line above with):
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);

inpL = cur;
}

ggml_tensor * cur = inpL;
Collaborator


Suggested change (replace the last line above with):
cur = inpL;

Comment on lines +94 to +104
ggml_tensor * ffn_up = build_lora_mm(model.layers[il].ffn_up, cur);
cb(ffn_up, "ffn_up", il);

ggml_tensor * ffn_gate = build_lora_mm(model.layers[il].ffn_gate, cur);
cb(ffn_gate, "ffn_gate", il);

ggml_tensor * ffn_act = ggml_swiglu_split(ctx0, ffn_gate, ffn_up);
cb(ffn_act, "ffn_act", il);

cur = build_lora_mm(model.layers[il].ffn_down, ffn_act);
cb(cur, "ffn_down", il);
Collaborator


Suggested change (replace the block above with):
cur = build_ffn(cur,
        model.layers[il].ffn_up,   NULL, NULL,
        NULL,                      NULL, NULL,
        model.layers[il].ffn_down, NULL, NULL,
        NULL,
        LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
cb(cur, "ffn_out", il);

Comment on lines +1197 to +1202
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
if (hparams.n_swa == 0) {
    hparams.n_swa = 2048;
}
hparams.set_swa_pattern(8);
Collaborator


Suggested change (replace the block above with):
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
if (found_swa && hparams.n_swa > 0) {
    hparams.swa_type                  = LLAMA_SWA_TYPE_STANDARD;
    hparams.rope_freq_base_train_swa  = 10000.0f;
    hparams.rope_freq_scale_train_swa = 1.0f;
    hparams.set_swa_pattern(8);
} else {
    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
}

Not sure where you were going with this, but since you seem to handle non-swa I'm guessing something like this?


Labels: model (Model specific), python (python script changes)
Participants: @mmnga, @CISC, @mmngays