@@ -4247,16 +4247,25 @@ struct llm_build_context {
4247
4247
const int n_experts_per_tok = 2 ;
4248
4248
4249
4249
ggml_tensor * logits = ggml_mul_mat (ctx0, model.layers [il].ffn_gate_inp , cur); // [n_tokens, num_experts]
4250
+ cb (logits, " ffn_moe_logits" , il);
4251
+
4250
4252
ggml_tensor * probs = ggml_soft_max (ctx0, logits); // [n_tokens, num_experts]
4253
+ cb (probs, " ffn_moe_probs" , il);
4251
4254
4252
4255
// select experts
4253
4256
ggml_tensor * selected_experts = ggml_top_k (ctx0, probs, n_experts_per_tok); // [n_tokens, num_experts_per_tok]
4254
- ggml_tensor * weights =
4255
- ggml_reshape_2d (ctx0,
4256
- ggml_get_rows (ctx0,
4257
- ggml_reshape_3d (ctx0, probs, 1 , n_experts, n_tokens), selected_experts),
4257
+ ggml_tensor * weights = ggml_get_rows (ctx0,
4258
+ ggml_reshape_3d (ctx0, probs, 1 , n_experts, n_tokens), selected_experts);
4259
+ cb (weights, " ffn_moe_weights" , il);
4260
+
4261
+ weights = ggml_reshape_2d (ctx0, weights,
4258
4262
n_experts_per_tok, n_tokens); // [n_tokens, num_experts_per_tok]
4259
- weights = ggml_div (ctx0, weights, ggml_sum_rows (ctx0, weights)); // [n_tokens, num_experts_per_tok]
4263
+
4264
+ ggml_tensor * weights_sum = ggml_sum_rows (ctx0, weights);
4265
+ cb (weights_sum, " ffn_moe_weights_sum" , il);
4266
+
4267
+ weights = ggml_div (ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
4268
+ cb (weights, " ffn_moe_weights_norm" , il);
4260
4269
4261
4270
// compute expert outputs
4262
4271
ggml_tensor * moe_out;
@@ -4269,19 +4278,30 @@ struct llm_build_context {
4269
4278
ggml_tensor ** ffn_gate_exp = (ggml_tensor **) model.layers [il].ffn_gate_exp ;
4270
4279
ggml_tensor ** ffn_down_exp = (ggml_tensor **) model.layers [il].ffn_down_exp ;
4271
4280
4272
- cur_expert = ggml_mul (ctx0,
4273
- ggml_mul_mat_id (ctx0, ffn_up_exp, n_experts, selected_experts, i, cur),
4274
- ggml_silu (ctx0,
4275
- ggml_mul_mat_id (ctx0, ffn_gate_exp, n_experts, selected_experts, i, cur))); // [n_tokens, n_embd]
4281
+ ggml_tensor * cur_up = ggml_mul_mat_id (ctx0, ffn_up_exp, n_experts, selected_experts, i, cur);
4282
+ cb (cur_up, " ffn_up" , il);
4283
+
4284
+ ggml_tensor * cur_gate = ggml_mul_mat_id (ctx0, ffn_gate_exp, n_experts, selected_experts, i, cur);
4285
+ cb (cur_gate, " ffn_gate" , il);
4286
+
4287
+ cur_gate = ggml_silu (ctx0, cur_gate);
4288
+ cb (cur_gate, " ffn_silu" , il);
4289
+
4290
+ cur_expert = ggml_mul (ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
4291
+ cb (cur_expert, " ffn_gate_par" , il);
4276
4292
4277
4293
cur_expert = ggml_mul_mat_id (ctx0, ffn_down_exp, n_experts, selected_experts, i, cur_expert); // [n_tokens, n_embd]
4294
+ cb (cur_expert, " ffn_down" , il);
4295
+
4278
4296
cur_expert = ggml_mul (ctx0, cur_expert,
4279
4297
ggml_view_2d (ctx0, weights, 1 , n_tokens, weights->nb [1 ], i*weights->nb [0 ]));
4298
+ cb (cur_expert, " ffn_moe_weighted" , il);
4280
4299
4281
4300
if (i == 0 ) {
4282
4301
moe_out = cur_expert;
4283
4302
} else {
4284
4303
moe_out = ggml_add (ctx0, moe_out, cur_expert);
4304
+ cb (moe_out, " ffn_moe_out" , il);
4285
4305
}
4286
4306
}
4287
4307
@@ -5540,6 +5560,14 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
5540
5560
{ " ffn_relu" , OFFLOAD_FUNC },
5541
5561
{ " ffn_sqr(relu)" , OFFLOAD_FUNC },
5542
5562
5563
+ { " ffn_moe_logits" , OFFLOAD_FUNC },
5564
+ { " ffn_moe_probs" , OFFLOAD_FUNC },
5565
+ { " ffn_moe_weights" , OFFLOAD_FUNC_NOP },
5566
+ { " ffn_moe_weights_sum" , OFFLOAD_FUNC },
5567
+ { " ffn_moe_weights_norm" , OFFLOAD_FUNC },
5568
+ { " ffn_moe_weighted" , OFFLOAD_FUNC },
5569
+ { " ffn_moe_out" , OFFLOAD_FUNC },
5570
+
5543
5571
{ " l_out" , OFFLOAD_FUNC },
5544
5572
5545
5573
{ " result_norm" , OFFLOAD_FUNC_EMB },
0 commit comments