```plaintext
Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 5120, padding_idx=151654)
    (layers): ModuleList(
      (0-5): 6 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear4bit(in_features=5120, out_features=8192, bias=False)
          (k_proj): Linear4bit(in_features=5120, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=5120, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=8192, out_features=5120, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear4bit(in_features=5120, out_features=25600, bias=False)
          (up_proj): Linear4bit(in_features=5120, out_features=25600, bias=False)
          (down_proj): Linear4bit(in_features=25600, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((5120,), eps=1e-06)
        (post_attention_layernorm): Qwen3RMSNorm((5120,), eps=1e-06)
      )
      (6): Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=5120, out_features=8192, bias=False)
          (k_proj): Linear(in_features=5120, out_features=1024, bias=False)
          (v_proj): Linear(in_features=5120, out_features=1024, bias=False)
          (o_proj): Linear(in_features=8192, out_features=5120, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=5120, out_features=25600, bias=False)
          (up_proj): Linear(in_features=5120, out_features=25600, bias=False)
          (down_proj): Linear(in_features=25600, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((5120,), eps=1e-06)
        (post_attention_layernorm): Qwen3RMSNorm((5120,), eps=1e-06)
      )
      (7): Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear4bit(in_features=5120, out_features=8192, bias=False)
          (k_proj): Linear4bit(in_features=5120, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=5120, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=8192, out_features=5120, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear4bit(in_features=5120, out_features=25600, bias=False)
          (up_proj): Linear4bit(in_features=5120, out_features=25600, bias=False)
          (down_proj): Linear(in_features=25600, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((5120,), eps=1e-06)
        (post_attention_layernorm): Qwen3RMSNorm((5120,), eps=1e-06)
      )
      (8-22): 15 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear4bit(in_features=5120, out_features=8192, bias=False)
          (k_proj): Linear4bit(in_features=5120, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=5120, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=8192, out_features=5120, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear4bit(in_features=5120, out_features=25600, bias=False)
          (up_proj): Linear4bit(in_features=5120, out_features=25600, bias=False)
          (down_proj): Linear4bit(in_features=25600, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((5120,), eps=1e-06)
        (post_attention_layernorm): Qwen3RMSNorm((5120,), eps=1e-06)
      )
      (23-44): 22 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear4bit(in_features=5120, out_features=8192, bias=False)
          (k_proj): Linear4bit(in_features=5120, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=5120, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=8192, out_features=5120, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=5120, out_features=25600, bias=False)
          (up_proj): Linear(in_features=5120, out_features=25600, bias=False)
          (down_proj): Linear(in_features=25600, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((5120,), eps=1e-06)
        (post_attention_layernorm): Qwen3RMSNorm((5120,), eps=1e-06)
      )
      (45): Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear4bit(in_features=5120, out_features=8192, bias=False)
          (k_proj): Linear4bit(in_features=5120, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=5120, out_features=1024, bias=False)
          (o_proj): Linear(in_features=8192, out_features=5120, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=5120, out_features=25600, bias=False)
          (up_proj): Linear(in_features=5120, out_features=25600, bias=False)
          (down_proj): Linear(in_features=25600, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((5120,), eps=1e-06)
        (post_attention_layernorm): Qwen3RMSNorm((5120,), eps=1e-06)
      )
      (46-54): 9 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear4bit(in_features=5120, out_features=8192, bias=False)
          (k_proj): Linear4bit(in_features=5120, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=5120, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=8192, out_features=5120, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=5120, out_features=25600, bias=False)
          (up_proj): Linear(in_features=5120, out_features=25600, bias=False)
          (down_proj): Linear(in_features=25600, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((5120,), eps=1e-06)
        (post_attention_layernorm): Qwen3RMSNorm((5120,), eps=1e-06)
      )
      (55-61): 7 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear4bit(in_features=5120, out_features=8192, bias=False)
          (k_proj): Linear4bit(in_features=5120, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=5120, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=8192, out_features=5120, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear4bit(in_features=5120, out_features=25600, bias=False)
          (up_proj): Linear4bit(in_features=5120, out_features=25600, bias=False)
          (down_proj): Linear4bit(in_features=25600, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((5120,), eps=1e-06)
        (post_attention_layernorm): Qwen3RMSNorm((5120,), eps=1e-06)
      )
      (62): Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear4bit(in_features=5120, out_features=8192, bias=False)
          (k_proj): Linear4bit(in_features=5120, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=5120, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=8192, out_features=5120, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=5120, out_features=25600, bias=False)
          (up_proj): Linear(in_features=5120, out_features=25600, bias=False)
          (down_proj): Linear(in_features=25600, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((5120,), eps=1e-06)
        (post_attention_layernorm): Qwen3RMSNorm((5120,), eps=1e-06)
      )
      (63): Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear4bit(in_features=5120, out_features=8192, bias=False)
          (k_proj): Linear4bit(in_features=5120, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=5120, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=8192, out_features=5120, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear4bit(in_features=5120, out_features=25600, bias=False)
          (up_proj): Linear4bit(in_features=5120, out_features=25600, bias=False)
          (down_proj): Linear4bit(in_features=25600, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((5120,), eps=1e-06)
        (post_attention_layernorm): Qwen3RMSNorm((5120,), eps=1e-06)
      )
    )
    (norm): Qwen3RMSNorm((5120,), eps=1e-06)
    (rotary_emb): LlamaRotaryEmbedding()
  )
  (lm_head): Linear(in_features=5120, out_features=151936, bias=False)
)
```
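
The dump above shows a mixed layout: most projections are bitsandbytes Linear4bit, while a handful of sub-modules remain plain Linear, i.e. unquantized (layer 6 in full, the MLP down_proj of layer 7, the MLPs of layers 23 through 54 and of layer 62, the o_proj of layer 45, and lm_head). As a reference only, the snippet below is a minimal sketch of how such a selectively quantized model can be produced with transformers' BitsAndBytesConfig; the checkpoint name Qwen/Qwen3-32B (whose hidden size 5120, 64 layers, and 151936-token vocabulary match this dump) and the llm_int8_skip_modules entries are assumptions for illustration, not the configuration that actually generated this printout.

```python
# Minimal sketch (assumed, not the original loading code): reproducing a mixed
# 4-bit / full-precision layout like the dump above with bitsandbytes + transformers.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Illustrative skip list -- the real list behind the printout is not shown here.
# A module whose qualified name matches one of these prefixes keeps nn.Linear
# instead of being converted to Linear4bit.
skip_modules = [
    "lm_head",
    "model.layers.6",                      # keep all of decoder layer 6 in full precision
    "model.layers.7.mlp.down_proj",
    "model.layers.45.self_attn.o_proj",
    "model.layers.62.mlp",
]
skip_modules += [f"model.layers.{i}.mlp" for i in range(23, 55)]  # MLPs of layers 23-54

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    llm_int8_skip_modules=skip_modules,    # modules listed here stay unquantized
)

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-32B",                      # assumed checkpoint; its dimensions match the dump
    quantization_config=bnb_config,
    device_map="auto",
)
print(model)  # prints a module tree with mixed Linear4bit / Linear layers, as above
```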