diff --git a/docs/zh/dpo_and_lora_guide.md b/docs/zh/dpo_and_lora_guide.md index a55e6ad3c03..026d2ea3848 100644 --- a/docs/zh/dpo_and_lora_guide.md +++ b/docs/zh/dpo_and_lora_guide.md @@ -77,7 +77,7 @@ mix_strategy: concat ### model model_name_or_path: baidu/ERNIE-4.5-0.3B-PT -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -135,7 +135,7 @@ mix_strategy: concat ### model model_name_or_path: baidu/ERNIE-4.5-0.3B-PT -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 @@ -187,7 +187,7 @@ load_checkpoint_format: flex_checkpoint `model_name_or_path`:模型本地路径或 HuggingFace 仓库对应的名称,如`baidu/ERNIE-4.5-0.3B-PT`,推荐使用 SFT 后的模型 -`attn_impl`:模型 Attention Mask 实现方式,推荐使用 `flashmask`,是一种针对 FlashAttention 的一种核心优化技术。 +`_attn_implementation`:模型 Attention Mask 实现方式,推荐使用 `flashmask`,是一种针对 FlashAttention 的一种核心优化技术。 `lora`:Bool 类型,是否 lora 训练,默认`False`。 diff --git a/docs/zh/pt_and_cpt_guide.md b/docs/zh/pt_and_cpt_guide.md index 113b3bbb8d0..895c1a11a57 100644 --- a/docs/zh/pt_and_cpt_guide.md +++ b/docs/zh/pt_and_cpt_guide.md @@ -59,7 +59,7 @@ mix_strategy: concat ### model model_name_or_path: baidu/ERNIE-4.5-0.3B-Base-PT -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -108,7 +108,7 @@ load_checkpoint_format: flex_checkpoint `model_name_or_path`:模型本地路径或 HuggingFace 仓库对应的名称,如`baidu/ERNIE-4.5-0.3B-Base-PT` -`attn_impl`:模型 Attention Mask 实现方式,推荐使用 `flashmask`,是一种针对 FlashAttention 的一种核心优化技术。 +`_attn_implementation`:模型 Attention Mask 实现方式,推荐使用 `flashmask`,是一种针对 FlashAttention 的一种核心优化技术。 `stage`:与训练类型相关,预训练设置`PT` diff --git a/docs/zh/sft_and_lora_guide.md b/docs/zh/sft_and_lora_guide.md index bba9cb77d86..9f2bd40067d 100644 --- a/docs/zh/sft_and_lora_guide.md +++ b/docs/zh/sft_and_lora_guide.md @@ -67,7 +67,7 @@ mix_strategy: concat ### model model_name_or_path: baidu/ERNIE-4.5-0.3B-Base-PT -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -124,7 +124,7 @@ mix_strategy: concat ### model model_name_or_path: baidu/ERNIE-4.5-0.3B-Base-PT -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 @@ -175,7 +175,7 @@ load_checkpoint_format: flex_checkpoint `model_name_or_path`:模型本地路径或 HuggingFace 仓库对应的名称,如`baidu/ERNIE-4.5-0.3B-Base-PT` -`attn_impl`:模型 Attention Mask 实现方式,推荐使用 `flashmask`,是一种针对 FlashAttention 的一种核心优化技术。 +`_attn_implementation`:模型 Attention Mask 实现方式,推荐使用 `flashmask`,是一种针对 FlashAttention 的一种核心优化技术。 `lora`:Bool 类型,是否 lora 训练,默认`False`。 diff --git a/docs/zh/training_arguments.md b/docs/zh/training_arguments.md index 8adb97b1a17..bdd064981da 100644 --- a/docs/zh/training_arguments.md +++ b/docs/zh/training_arguments.md @@ -283,7 +283,7 @@ --expert_model_parallel_size 专家并行的并行度。(`int`, 可选) - --aux_loss_alpha + --router_aux_loss_coef MoE 模型的辅助损失(Auxiliary loss)权重系数。(`float`, 可选, 默认为 0.0001) --expert_max_capacity diff --git a/examples/best_practices/DeepSeek-V3/SFT-Practice.md b/examples/best_practices/DeepSeek-V3/SFT-Practice.md index 2bbff46f2f9..8304b028e75 100644 --- a/examples/best_practices/DeepSeek-V3/SFT-Practice.md +++ b/examples/best_practices/DeepSeek-V3/SFT-Practice.md @@ -80,4 +80,4 @@ mpirun bash run_dsv3_4k.sh * 在 MoE 模型中,专家间负载不均衡也可能引发 OOM 错误。为此,合理引入 AuxLoss 及其无辅助损失机制至关重要。以下是实验过程中总结的关键注意事项: * Gate 计算隔离:e_score_correction_bias 应仅用于门控权重计算,避免传递至后续 FFN 模块。 * AuxLoss 计算适配:在 SP 或 Subbatch 等并行策略下,需注意 seq_len 的实际取值,确保损失计算正确。 - * 配置调整:Hugging Face 所提供的部分配置(如 aux_loss_alpha)需结合具体训练场景进行针对性调优。 + * 配置调整:Hugging Face 所提供的部分配置(如 router_aux_loss_coef)需结合具体训练场景进行针对性调优。 diff --git 
a/examples/best_practices/DeepSeek-V3/dsv3_128k_config.yaml b/examples/best_practices/DeepSeek-V3/dsv3_128k_config.yaml index ef14fb3aa44..35540d96472 100644 --- a/examples/best_practices/DeepSeek-V3/dsv3_128k_config.yaml +++ b/examples/best_practices/DeepSeek-V3/dsv3_128k_config.yaml @@ -75,10 +75,8 @@ sharding: stage1 bf16: true amp_master_grad: true fp16_opt_level: O2 -use_flash_attention: true use_attn_mask_startend_row_indices: true -using_fake_gate: false +moe_router_force_load_balancing: false pre_alloc_memory: 60 tensorwise_offload_optimizer: true -fuse_rms_norm: true moe_subbatch_token_num_before_dispatch: 1024 \ No newline at end of file diff --git a/examples/best_practices/DeepSeek-V3/dsv3_32k_config.yaml b/examples/best_practices/DeepSeek-V3/dsv3_32k_config.yaml index 0092964981c..8f5d40e0b56 100644 --- a/examples/best_practices/DeepSeek-V3/dsv3_32k_config.yaml +++ b/examples/best_practices/DeepSeek-V3/dsv3_32k_config.yaml @@ -75,10 +75,8 @@ sharding: stage1 bf16: true amp_master_grad: true fp16_opt_level: O2 -use_flash_attention: true use_attn_mask_startend_row_indices: true -using_fake_gate: false +moe_router_force_load_balancing: false pre_alloc_memory: 60 tensorwise_offload_optimizer: true -fuse_rms_norm: true moe_subbatch_token_num_before_dispatch: 0 \ No newline at end of file diff --git a/examples/best_practices/DeepSeek-V3/dsv3_4k_config.yaml b/examples/best_practices/DeepSeek-V3/dsv3_4k_config.yaml index c0f48ac740e..06e9a6d0fb3 100644 --- a/examples/best_practices/DeepSeek-V3/dsv3_4k_config.yaml +++ b/examples/best_practices/DeepSeek-V3/dsv3_4k_config.yaml @@ -75,10 +75,8 @@ sharding: stage1 bf16: true amp_master_grad: true fp16_opt_level: O2 -use_flash_attention: true use_attn_mask_startend_row_indices: true -using_fake_gate: false +moe_router_force_load_balancing: false pre_alloc_memory: 60 tensorwise_offload_optimizer: true -fuse_rms_norm: true moe_subbatch_token_num_before_dispatch: 0 \ No newline at end of file diff --git a/examples/best_practices/DeepSeek-V3/pretrain/config/config.json b/examples/best_practices/DeepSeek-V3/pretrain/config/config.json index 8e64a1615dd..ee0afa87007 100644 --- a/examples/best_practices/DeepSeek-V3/pretrain/config/config.json +++ b/examples/best_practices/DeepSeek-V3/pretrain/config/config.json @@ -9,8 +9,8 @@ "AutoModel": "DeepseekV2ModelFast", "AutoModelForCausalLM": "DeepseekV2ForCausalLM" }, - "aux_loss_alpha": 0.0001, - "aux_loss_free_gamma": 0.0, + "router_aux_loss_coef": 0.0001, + "moe_router_bias_update_rate": 0.0, "bos_token_id": 0, "eos_token_id": 1, "ep_size": 1, @@ -61,8 +61,6 @@ "v_head_dim": 128, "vocab_size": 129280, "using_flex_token": true, - "fuse_rms_norm": true, - "fuse_attention_ffn": true, "apply_rope_fusion": true, "token_drop_steps": 0, "recompute_fwd_gate_up": true, diff --git a/examples/best_practices/DeepSeek-V3/pretrain/config/pretrain_argument.yaml b/examples/best_practices/DeepSeek-V3/pretrain/config/pretrain_argument.yaml index 98980deed53..11d09900ec6 100644 --- a/examples/best_practices/DeepSeek-V3/pretrain/config/pretrain_argument.yaml +++ b/examples/best_practices/DeepSeek-V3/pretrain/config/pretrain_argument.yaml @@ -23,7 +23,6 @@ expert_model_parallel_size: 2 sharding: "stage1" virtual_pipeline_model_parallel_size: 1 sequence_parallel: 0 -use_flash_attention: true max_seq_len: 4097 learning_rate: 0.000022 min_lr: 0.00000073333 @@ -48,8 +47,6 @@ distributed_dataloader: 1 unified_checkpoint: true save_total_limit: 2 skip_profile_timer: false -fuse_rms_norm: true -fuse_attention_ffn: true 
apply_rope_fusion: true save_sharded_model: false load_sharded_model: false @@ -58,7 +55,7 @@ unified_checkpoint_config: "ignore_merge_optimizer" offload_optim: true reorder_pipeline_priority: true num_nextn_predict_layers: 1 -using_fake_gate: false +moe_router_force_load_balancing: false hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 pre_alloc_memory: 61 \ No newline at end of file diff --git a/examples/best_practices/ERNIE-4.5-VL/ernie45vl_32k_config.yaml b/examples/best_practices/ERNIE-4.5-VL/ernie45vl_32k_config.yaml index 56a4bd83f9d..7496d7741cf 100644 --- a/examples/best_practices/ERNIE-4.5-VL/ernie45vl_32k_config.yaml +++ b/examples/best_practices/ERNIE-4.5-VL/ernie45vl_32k_config.yaml @@ -11,7 +11,7 @@ random_shuffle: false ### model model_name_or_path: baidu/ERNIE-4.5-VL-28B-A3B-Thinking -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -55,7 +55,6 @@ recompute_num_layers: 1 recompute_modules: ["loss_fn"] recompute_use_reentrant: true -use_flash_attention: true sequence_parallel: true pp_seg_method: layer:Ernie4_5_DecoderLayer|ErnieDecoderLayer|EmptyLayer offload_queue: true diff --git a/examples/best_practices/ERNIE-4.5-VL/ernie45vl_8k_config.yaml b/examples/best_practices/ERNIE-4.5-VL/ernie45vl_8k_config.yaml index 3b1e6a95b58..e519f71bc30 100644 --- a/examples/best_practices/ERNIE-4.5-VL/ernie45vl_8k_config.yaml +++ b/examples/best_practices/ERNIE-4.5-VL/ernie45vl_8k_config.yaml @@ -11,7 +11,7 @@ random_shuffle: false ### model model_name_or_path: baidu/ERNIE-4.5-VL-28B-A3B-Thinking -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -55,7 +55,6 @@ recompute_num_layers: 1 recompute_modules: ["loss_fn"] recompute_use_reentrant: true -use_flash_attention: true sequence_parallel: true pp_seg_method: layer:Ernie4_5_DecoderLayer|ErnieDecoderLayer|EmptyLayer offload_queue: true diff --git a/examples/best_practices/ERNIE-4.5-VL/ernie45vl_8k_lora_config.yaml b/examples/best_practices/ERNIE-4.5-VL/ernie45vl_8k_lora_config.yaml index b49aae15c08..64b82d9435c 100644 --- a/examples/best_practices/ERNIE-4.5-VL/ernie45vl_8k_lora_config.yaml +++ b/examples/best_practices/ERNIE-4.5-VL/ernie45vl_8k_lora_config.yaml @@ -11,7 +11,7 @@ random_shuffle: false ### model model_name_or_path: baidu/ERNIE-4.5-VL-28B-A3B-Thinking -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 32 @@ -57,7 +57,6 @@ recompute_num_layers: 1 recompute_modules: ["loss_fn"] recompute_use_reentrant: true -use_flash_attention: true sequence_parallel: true pp_seg_method: layer:Ernie4_5_DecoderLayer|ErnieDecoderLayer|EmptyLayer offload_queue: true diff --git a/examples/best_practices/PaddleOCR-VL/README.md b/examples/best_practices/PaddleOCR-VL/README.md index f2e3a3d53b6..980f7665e63 100644 --- a/examples/best_practices/PaddleOCR-VL/README.md +++ b/examples/best_practices/PaddleOCR-VL/README.md @@ -134,7 +134,7 @@ template: paddleocr_vl ### model model_name_or_path: PaddlePaddle/PaddleOCR-VL -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -207,7 +207,7 @@ template: paddleocr_vl ### model model_name_or_path: PaddlePaddle/PaddleOCR-VL -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 @@ -728,7 +728,7 @@ CUDA_VISIBLE_DEVICES=0 paddleformers-cli train examples/best_practices/PaddleOCR per_device_train_batch_size=2 \ per_device_eval_batch_size=2 \ gradient_accumulation_steps=32 \ - attn_impl=sdpa \ + _attn_implementation=sdpa \ pre_alloc_memory=18 \ device=iluvatar_gpu ``` diff 
--git a/examples/best_practices/PaddleOCR-VL/paddleocr-vl_full_16k_config.yaml b/examples/best_practices/PaddleOCR-VL/paddleocr-vl_full_16k_config.yaml index 2bb001d40d2..6d1e3debe44 100644 --- a/examples/best_practices/PaddleOCR-VL/paddleocr-vl_full_16k_config.yaml +++ b/examples/best_practices/PaddleOCR-VL/paddleocr-vl_full_16k_config.yaml @@ -15,7 +15,7 @@ template: paddleocr_vl ### model model_name_or_path: PaddlePaddle/PaddleOCR-VL -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/best_practices/PaddleOCR-VL/paddleocr-vl_lora_16k_config.yaml b/examples/best_practices/PaddleOCR-VL/paddleocr-vl_lora_16k_config.yaml index 6f4cbf00a0c..18ec25325ef 100644 --- a/examples/best_practices/PaddleOCR-VL/paddleocr-vl_lora_16k_config.yaml +++ b/examples/best_practices/PaddleOCR-VL/paddleocr-vl_lora_16k_config.yaml @@ -15,7 +15,7 @@ template: paddleocr_vl ### model model_name_or_path: PaddlePaddle/PaddleOCR-VL -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/best_practices/tutorials/how_to_train_a_function_call_model.md b/examples/best_practices/tutorials/how_to_train_a_function_call_model.md index 1301a774a3e..a3f09e83833 100644 --- a/examples/best_practices/tutorials/how_to_train_a_function_call_model.md +++ b/examples/best_practices/tutorials/how_to_train_a_function_call_model.md @@ -218,7 +218,7 @@ template: qwen3 ### model model_name_or_path: Qwen/Qwen3-0.6B -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/best_practices/tutorials/how_to_train_a_reasoning_model.md b/examples/best_practices/tutorials/how_to_train_a_reasoning_model.md index 49553a08a77..e2674eb538a 100644 --- a/examples/best_practices/tutorials/how_to_train_a_reasoning_model.md +++ b/examples/best_practices/tutorials/how_to_train_a_reasoning_model.md @@ -188,7 +188,7 @@ template: qwen3 ### model model_name_or_path: Qwen/Qwen3-0.6B -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/best_practices/tutorials/how_to_train_a_visual_grounding_model.md b/examples/best_practices/tutorials/how_to_train_a_visual_grounding_model.md index f237076bc9e..9f2d4bceb26 100644 --- a/examples/best_practices/tutorials/how_to_train_a_visual_grounding_model.md +++ b/examples/best_practices/tutorials/how_to_train_a_visual_grounding_model.md @@ -444,7 +444,7 @@ template: qwen2_vl ### model model_name_or_path: Qwen/Qwen2.5-VL-7B-Instruct -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 lora_alpha: 32 diff --git a/examples/best_practices/tutorials/how_to_train_an_emoji_model.md b/examples/best_practices/tutorials/how_to_train_an_emoji_model.md index 6fda3e9403d..30dc9c711c6 100644 --- a/examples/best_practices/tutorials/how_to_train_an_emoji_model.md +++ b/examples/best_practices/tutorials/how_to_train_an_emoji_model.md @@ -267,7 +267,7 @@ mix_strategy: concat ### model model_name_or_path: Qwen/Qwen3-0.6B -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -408,7 +408,7 @@ mix_strategy: concat ### model model_name_or_path: ./checkpoints/paddleformers_qwen3_0p6b_sft_ckpts_emoji/ -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/dpo/full.yaml b/examples/config/dpo/full.yaml index d602ebaa13f..333383b762d 100644 --- a/examples/config/dpo/full.yaml +++ b/examples/config/dpo/full.yaml @@ -13,7 +13,7 @@ template: qwen3 ### model model_name_or_path: 
Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/dpo/full_function_call.yaml b/examples/config/dpo/full_function_call.yaml index 6484f9a9edb..fd5e2c01fe1 100644 --- a/examples/config/dpo/full_function_call.yaml +++ b/examples/config/dpo/full_function_call.yaml @@ -14,7 +14,7 @@ split_multi_turn: False ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask use_fused_head_and_loss_fn: false loss_subbatch_sequence_length: 8192 diff --git a/examples/config/dpo/full_tp_pp.yaml b/examples/config/dpo/full_tp_pp.yaml index b17fe3c0f10..78a930e9946 100644 --- a/examples/config/dpo/full_tp_pp.yaml +++ b/examples/config/dpo/full_tp_pp.yaml @@ -14,7 +14,7 @@ template: qwen3 ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/dpo/full_tp_pp_ep.yaml b/examples/config/dpo/full_tp_pp_ep.yaml index fe42e75d935..d8c23af56f3 100644 --- a/examples/config/dpo/full_tp_pp_ep.yaml +++ b/examples/config/dpo/full_tp_pp_ep.yaml @@ -14,7 +14,7 @@ template: qwen3 ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/dpo/lora.yaml b/examples/config/dpo/lora.yaml index 1e9554a09d5..958d570e323 100644 --- a/examples/config/dpo/lora.yaml +++ b/examples/config/dpo/lora.yaml @@ -13,7 +13,7 @@ template: qwen3 ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/config/dpo/lora_tp_pp.yaml b/examples/config/dpo/lora_tp_pp.yaml index fe4bc3a5feb..6310342b0dc 100644 --- a/examples/config/dpo/lora_tp_pp.yaml +++ b/examples/config/dpo/lora_tp_pp.yaml @@ -13,7 +13,7 @@ template: qwen3 ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/config/dpo/lora_tp_pp_ep.yaml b/examples/config/dpo/lora_tp_pp_ep.yaml index ee1792e4c25..e3129f24ea9 100644 --- a/examples/config/dpo/lora_tp_pp_ep.yaml +++ b/examples/config/dpo/lora_tp_pp_ep.yaml @@ -13,7 +13,7 @@ template: qwen3 ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/config/iluvatar/ERNIE-4.5-0.3B-PT/sft/full_8k.yaml b/examples/config/iluvatar/ERNIE-4.5-0.3B-PT/sft/full_8k.yaml index c8c353289de..4992234d014 100644 --- a/examples/config/iluvatar/ERNIE-4.5-0.3B-PT/sft/full_8k.yaml +++ b/examples/config/iluvatar/ERNIE-4.5-0.3B-PT/sft/full_8k.yaml @@ -13,7 +13,7 @@ template: ernie_nothink ### model model_name_or_path: baidu/ERNIE-4.5-0.3B-PT -attn_impl: eager +_attn_implementation: eager ### finetuning # base diff --git a/examples/config/iluvatar/ERNIE-4.5-0.3B-PT/sft/lora_8k.yaml b/examples/config/iluvatar/ERNIE-4.5-0.3B-PT/sft/lora_8k.yaml index 6c12c1aa070..c4b2e1935a8 100644 --- a/examples/config/iluvatar/ERNIE-4.5-0.3B-PT/sft/lora_8k.yaml +++ b/examples/config/iluvatar/ERNIE-4.5-0.3B-PT/sft/lora_8k.yaml @@ -13,7 +13,7 @@ template: ernie_nothink ### model model_name_or_path: baidu/ERNIE-4.5-0.3B-PT -attn_impl: eager +_attn_implementation: eager lora: true lora_rank: 8 diff --git a/examples/config/iluvatar/ERNIE-4.5-21B-A3B-PT/sft/full_8k.yaml b/examples/config/iluvatar/ERNIE-4.5-21B-A3B-PT/sft/full_8k.yaml index f5d5a012ce3..a1c3ee5a94a 100644 --- 
a/examples/config/iluvatar/ERNIE-4.5-21B-A3B-PT/sft/full_8k.yaml +++ b/examples/config/iluvatar/ERNIE-4.5-21B-A3B-PT/sft/full_8k.yaml @@ -13,7 +13,7 @@ template: ernie_nothink ### model model_name_or_path: baidu/ERNIE-4.5-21B-A3B-PT -attn_impl: eager +_attn_implementation: eager ### finetuning # base diff --git a/examples/config/iluvatar/ERNIE-4.5-21B-A3B-PT/sft/lora_8k.yaml b/examples/config/iluvatar/ERNIE-4.5-21B-A3B-PT/sft/lora_8k.yaml index 13a48ad5109..f26f0f3161b 100644 --- a/examples/config/iluvatar/ERNIE-4.5-21B-A3B-PT/sft/lora_8k.yaml +++ b/examples/config/iluvatar/ERNIE-4.5-21B-A3B-PT/sft/lora_8k.yaml @@ -13,7 +13,7 @@ template: ernie_nothink ### model model_name_or_path: baidu/ERNIE-4.5-21B-A3B-PT -attn_impl: eager +_attn_implementation: eager lora: true lora_rank: 8 diff --git a/examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_full_16k_config.yaml b/examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_full_16k_config.yaml index 4fa8d6e0dfb..f2cb052e5ca 100644 --- a/examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_full_16k_config.yaml +++ b/examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_full_16k_config.yaml @@ -15,7 +15,7 @@ template: paddleocr_vl ### model model_name_or_path: PaddlePaddle/PaddleOCR-VL -attn_impl: sdpa +_attn_implementation: sdpa ### finetuning # base diff --git a/examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_lora_16k_config.yaml b/examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_lora_16k_config.yaml index de7d9417d37..c83f56e2cc2 100644 --- a/examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_lora_16k_config.yaml +++ b/examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_lora_16k_config.yaml @@ -15,7 +15,7 @@ template: paddleocr_vl ### model model_name_or_path: PaddlePaddle/PaddleOCR-VL -attn_impl: sdpa +_attn_implementation: sdpa lora: true lora_rank: 8 diff --git a/examples/config/metax/ERNIE-4.5-0.3B/sft/lora.yaml b/examples/config/metax/ERNIE-4.5-0.3B/sft/lora.yaml index 77ee09f5f19..8b6747eee9c 100644 --- a/examples/config/metax/ERNIE-4.5-0.3B/sft/lora.yaml +++ b/examples/config/metax/ERNIE-4.5-0.3B/sft/lora.yaml @@ -13,7 +13,7 @@ template: ernie_nothink ### model model_name_or_path: baidu/ERNIE-4.5-0.3B-PT -attn_impl: eager +_attn_implementation: eager lora: true lora_rank: 8 diff --git a/examples/config/metax/ERNIE-4.5-0.3B/sft/sft.yaml b/examples/config/metax/ERNIE-4.5-0.3B/sft/sft.yaml index debf40fc597..d9c6d31fdf6 100644 --- a/examples/config/metax/ERNIE-4.5-0.3B/sft/sft.yaml +++ b/examples/config/metax/ERNIE-4.5-0.3B/sft/sft.yaml @@ -13,7 +13,7 @@ template: ernie_nothink ### model model_name_or_path: baidu/ERNIE-4.5-0.3B-PT -attn_impl: eager +_attn_implementation: eager ### finetuning # base diff --git a/examples/config/metax/ERNIE-4.5-21B-A3B/sft/lora.yaml b/examples/config/metax/ERNIE-4.5-21B-A3B/sft/lora.yaml index ebaf2c5944c..045899aac7d 100644 --- a/examples/config/metax/ERNIE-4.5-21B-A3B/sft/lora.yaml +++ b/examples/config/metax/ERNIE-4.5-21B-A3B/sft/lora.yaml @@ -13,7 +13,7 @@ template: ernie_nothink ### model model_name_or_path: baidu/ERNIE-4.5-21B-A3B-PT -attn_impl: eager +_attn_implementation: eager lora: true lora_rank: 8 diff --git a/examples/config/metax/ERNIE-4.5-21B-A3B/sft/sft.yaml b/examples/config/metax/ERNIE-4.5-21B-A3B/sft/sft.yaml index ac28e16e106..91ed81b80b6 100644 --- a/examples/config/metax/ERNIE-4.5-21B-A3B/sft/sft.yaml +++ b/examples/config/metax/ERNIE-4.5-21B-A3B/sft/sft.yaml @@ -13,7 +13,7 @@ template: ernie_nothink ### model model_name_or_path: baidu/ERNIE-4.5-21B-A3B-PT 
-attn_impl: eager +_attn_implementation: eager ### finetuning # base diff --git a/examples/config/pt/full.yaml b/examples/config/pt/full.yaml index 74c59f5e8fa..a2e0d46b70e 100644 --- a/examples/config/pt/full.yaml +++ b/examples/config/pt/full.yaml @@ -11,7 +11,7 @@ mix_strategy: concat ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/pt/full_offline_data.yaml b/examples/config/pt/full_offline_data.yaml index d2adcb32a96..2ea68f339f8 100644 --- a/examples/config/pt/full_offline_data.yaml +++ b/examples/config/pt/full_offline_data.yaml @@ -7,7 +7,7 @@ mix_strategy: concat ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/pt/full_tp_pp.yaml b/examples/config/pt/full_tp_pp.yaml index a54f2942716..f4fc4e0f9a9 100644 --- a/examples/config/pt/full_tp_pp.yaml +++ b/examples/config/pt/full_tp_pp.yaml @@ -11,7 +11,7 @@ mix_strategy: concat ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/pt/full_tp_pp_ep.yaml b/examples/config/pt/full_tp_pp_ep.yaml index f724b18e2de..1f6c4e6edad 100644 --- a/examples/config/pt/full_tp_pp_ep.yaml +++ b/examples/config/pt/full_tp_pp_ep.yaml @@ -12,7 +12,7 @@ mix_strategy: concat ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/pt/lora.yaml b/examples/config/pt/lora.yaml index 1ac3323e5ea..f88bc71612d 100644 --- a/examples/config/pt/lora.yaml +++ b/examples/config/pt/lora.yaml @@ -11,7 +11,7 @@ mix_strategy: concat ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/config/pt/lora_tp_pp.yaml b/examples/config/pt/lora_tp_pp.yaml index 224d6220128..aec0078170f 100644 --- a/examples/config/pt/lora_tp_pp.yaml +++ b/examples/config/pt/lora_tp_pp.yaml @@ -11,7 +11,7 @@ mix_strategy: concat ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/config/pt/lora_tp_pp_ep.yaml b/examples/config/pt/lora_tp_pp_ep.yaml index 40afabeaaa1..d2800c73d9d 100644 --- a/examples/config/pt/lora_tp_pp_ep.yaml +++ b/examples/config/pt/lora_tp_pp_ep.yaml @@ -12,7 +12,7 @@ mix_strategy: concat ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/sft-vl/full.yaml b/examples/config/sft-vl/full.yaml index 73d98507e37..9667f143561 100644 --- a/examples/config/sft-vl/full.yaml +++ b/examples/config/sft-vl/full.yaml @@ -13,7 +13,7 @@ template: qwen2_vl ### model model_name_or_path: Qwen/Qwen2.5-VL-3B-Instruct -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/sft-vl/full_fsdp.yaml b/examples/config/sft-vl/full_fsdp.yaml index 35e2525d495..323b6fd645e 100644 --- a/examples/config/sft-vl/full_fsdp.yaml +++ b/examples/config/sft-vl/full_fsdp.yaml @@ -13,7 +13,7 @@ template: qwen2_vl ### model model_name_or_path: Qwen/Qwen2.5-VL-3B-Instruct -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/sft-vl/full_tp.yaml b/examples/config/sft-vl/full_tp.yaml index e7faba53ee7..c2364495a41 
100644 --- a/examples/config/sft-vl/full_tp.yaml +++ b/examples/config/sft-vl/full_tp.yaml @@ -13,7 +13,7 @@ template: qwen2_vl ### model model_name_or_path: Qwen/Qwen2.5-VL-3B-Instruct -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/sft-vl/lora.yaml b/examples/config/sft-vl/lora.yaml index 89b8db42029..f7f80245f23 100644 --- a/examples/config/sft-vl/lora.yaml +++ b/examples/config/sft-vl/lora.yaml @@ -13,7 +13,7 @@ template: qwen2_vl ### model model_name_or_path: Qwen/Qwen2.5-VL-3B-Instruct -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/config/sft-vl/lora_fsdp.yaml b/examples/config/sft-vl/lora_fsdp.yaml index e9704e1ac58..694b2009f2d 100644 --- a/examples/config/sft-vl/lora_fsdp.yaml +++ b/examples/config/sft-vl/lora_fsdp.yaml @@ -13,7 +13,7 @@ template: qwen2_vl ### model model_name_or_path: Qwen/Qwen2.5-VL-3B-Instruct -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/config/sft-vl/lora_tp.yaml b/examples/config/sft-vl/lora_tp.yaml index a42c8bc60d5..b674d751692 100644 --- a/examples/config/sft-vl/lora_tp.yaml +++ b/examples/config/sft-vl/lora_tp.yaml @@ -13,7 +13,7 @@ template: qwen2_vl ### model model_name_or_path: Qwen/Qwen2.5-VL-3B-Instruct -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/config/sft/full.yaml b/examples/config/sft/full.yaml index be52732f89f..7c5907060d7 100644 --- a/examples/config/sft/full.yaml +++ b/examples/config/sft/full.yaml @@ -13,7 +13,7 @@ template: qwen3 ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/sft/full_function_call.yaml b/examples/config/sft/full_function_call.yaml index e2edda92d93..ce7d2f58c91 100644 --- a/examples/config/sft/full_function_call.yaml +++ b/examples/config/sft/full_function_call.yaml @@ -14,7 +14,7 @@ split_multi_turn: False ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask use_fused_head_and_loss_fn: False loss_subbatch_sequence_length: 8192 diff --git a/examples/config/sft/full_tp_pp.yaml b/examples/config/sft/full_tp_pp.yaml index af75e061e1f..dfba5e4a420 100644 --- a/examples/config/sft/full_tp_pp.yaml +++ b/examples/config/sft/full_tp_pp.yaml @@ -13,7 +13,7 @@ template: qwen3 ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/sft/full_tp_pp_ep.yaml b/examples/config/sft/full_tp_pp_ep.yaml index fb754168c5f..2250a2cb3e0 100644 --- a/examples/config/sft/full_tp_pp_ep.yaml +++ b/examples/config/sft/full_tp_pp_ep.yaml @@ -14,7 +14,7 @@ template: qwen3 ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/sft/lora.yaml b/examples/config/sft/lora.yaml index 41b24a597da..9601cea349e 100644 --- a/examples/config/sft/lora.yaml +++ b/examples/config/sft/lora.yaml @@ -13,7 +13,7 @@ template: qwen3 ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/config/sft/lora_tp_pp.yaml b/examples/config/sft/lora_tp_pp.yaml index b88d0d82303..8495b7935d2 100644 --- a/examples/config/sft/lora_tp_pp.yaml +++ b/examples/config/sft/lora_tp_pp.yaml @@ -13,7 +13,7 @@ template: 
qwen3 ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/config/sft/lora_tp_pp_ep.yaml b/examples/config/sft/lora_tp_pp_ep.yaml index 1c3ab1a87b9..cea763c69fe 100644 --- a/examples/config/sft/lora_tp_pp_ep.yaml +++ b/examples/config/sft/lora_tp_pp_ep.yaml @@ -13,7 +13,7 @@ template: qwen3 ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/config/xpu/DeepseekV3/sft/full_32k_config.yaml b/examples/config/xpu/DeepseekV3/sft/full_32k_config.yaml index 71eba3c7955..301277e0fd9 100644 --- a/examples/config/xpu/DeepseekV3/sft/full_32k_config.yaml +++ b/examples/config/xpu/DeepseekV3/sft/full_32k_config.yaml @@ -75,11 +75,9 @@ sharding: stage1 bf16: true amp_master_grad: true fp16_opt_level: O2 -use_flash_attention: true use_attn_mask_startend_row_indices: true -using_fake_gate: false +moe_router_force_load_balancing: false pre_alloc_memory: 60 tensorwise_offload_optimizer: true -fuse_rms_norm: true moe_subbatch_token_num_before_dispatch: 0 device: xpu \ No newline at end of file diff --git a/examples/config/xpu/DeepseekV3/sft/full_4k_config.yaml b/examples/config/xpu/DeepseekV3/sft/full_4k_config.yaml index 2dc1856195d..ca16bbda4d9 100644 --- a/examples/config/xpu/DeepseekV3/sft/full_4k_config.yaml +++ b/examples/config/xpu/DeepseekV3/sft/full_4k_config.yaml @@ -75,11 +75,9 @@ sharding: stage1 bf16: true amp_master_grad: true fp16_opt_level: O2 -use_flash_attention: true use_attn_mask_startend_row_indices: true -using_fake_gate: false +moe_router_force_load_balancing: false pre_alloc_memory: 60 tensorwise_offload_optimizer: true -fuse_rms_norm: true moe_subbatch_token_num_before_dispatch: 0 device: xpu \ No newline at end of file diff --git a/examples/config/xpu/ERNIE-4.5-0.3B/sft/full_8k.yaml b/examples/config/xpu/ERNIE-4.5-0.3B/sft/full_8k.yaml index 0a293e883a3..0ac32f661cc 100644 --- a/examples/config/xpu/ERNIE-4.5-0.3B/sft/full_8k.yaml +++ b/examples/config/xpu/ERNIE-4.5-0.3B/sft/full_8k.yaml @@ -13,7 +13,7 @@ template: ernie_nothink ### model model_name_or_path: baidu/ERNIE-4.5-0.3B-PT -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/xpu/ERNIE-4.5-0.3B/sft/lora_8k.yaml b/examples/config/xpu/ERNIE-4.5-0.3B/sft/lora_8k.yaml index 9cbf220164c..f1c120823cd 100644 --- a/examples/config/xpu/ERNIE-4.5-0.3B/sft/lora_8k.yaml +++ b/examples/config/xpu/ERNIE-4.5-0.3B/sft/lora_8k.yaml @@ -13,7 +13,7 @@ template: ernie_nothink ### model model_name_or_path: baidu/ERNIE-4.5-0.3B-PT -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/config/xpu/ERNIE-4.5-21B-A3B/sft/full_32k.yaml b/examples/config/xpu/ERNIE-4.5-21B-A3B/sft/full_32k.yaml index 26211d2f88a..273caaf378b 100644 --- a/examples/config/xpu/ERNIE-4.5-21B-A3B/sft/full_32k.yaml +++ b/examples/config/xpu/ERNIE-4.5-21B-A3B/sft/full_32k.yaml @@ -13,7 +13,7 @@ template: ernie_nothink ### model model_name_or_path: baidu/ERNIE-4.5-21B-A3B-PT -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/xpu/ERNIE-4.5-21B-A3B/sft/lora_32k.yaml b/examples/config/xpu/ERNIE-4.5-21B-A3B/sft/lora_32k.yaml index 2f29a7ee833..b1cbb0dc7fa 100644 --- a/examples/config/xpu/ERNIE-4.5-21B-A3B/sft/lora_32k.yaml +++ b/examples/config/xpu/ERNIE-4.5-21B-A3B/sft/lora_32k.yaml @@ -13,7 +13,7 @@ template: ernie_nothink ### model 
model_name_or_path: baidu/ERNIE-4.5-21B-A3B-PT -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_full_16k_config.yaml b/examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_full_16k_config.yaml index 9aa887f5b50..6aef90622e2 100644 --- a/examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_full_16k_config.yaml +++ b/examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_full_16k_config.yaml @@ -15,7 +15,7 @@ template: paddleocr_vl ### model model_name_or_path: PaddlePaddle/PaddleOCR-VL -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_lora_16k_config.yaml b/examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_lora_16k_config.yaml index 093722c055f..0bba74dcb82 100644 --- a/examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_lora_16k_config.yaml +++ b/examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_lora_16k_config.yaml @@ -15,7 +15,7 @@ template: paddleocr_vl ### model model_name_or_path: PaddlePaddle/PaddleOCR-VL -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/experiments/deepseek_v3_pretrain/config/config.json b/examples/experiments/deepseek_v3_pretrain/config/config.json index 8e64a1615dd..ee0afa87007 100644 --- a/examples/experiments/deepseek_v3_pretrain/config/config.json +++ b/examples/experiments/deepseek_v3_pretrain/config/config.json @@ -9,8 +9,8 @@ "AutoModel": "DeepseekV2ModelFast", "AutoModelForCausalLM": "DeepseekV2ForCausalLM" }, - "aux_loss_alpha": 0.0001, - "aux_loss_free_gamma": 0.0, + "router_aux_loss_coef": 0.0001, + "moe_router_bias_update_rate": 0.0, "bos_token_id": 0, "eos_token_id": 1, "ep_size": 1, @@ -61,8 +61,6 @@ "v_head_dim": 128, "vocab_size": 129280, "using_flex_token": true, - "fuse_rms_norm": true, - "fuse_attention_ffn": true, "apply_rope_fusion": true, "token_drop_steps": 0, "recompute_fwd_gate_up": true, diff --git a/examples/experiments/deepseek_v3_pretrain/config/configuration.py b/examples/experiments/deepseek_v3_pretrain/config/configuration.py index 4c475d427ba..53fceddc57b 100644 --- a/examples/experiments/deepseek_v3_pretrain/config/configuration.py +++ b/examples/experiments/deepseek_v3_pretrain/config/configuration.py @@ -69,7 +69,7 @@ class DeepseekV2FastConfig(PretrainedConfig): Whether to normalize the weights of the routed experts. scoring_func (`str`, *optional*, defaults to 'softmax'): Method of computing expert weights. - aux_loss_alpha (`float`, *optional*, defaults to 0.001): + router_aux_loss_coef (`float`, *optional*, defaults to 0.001): Auxiliary loss weight coefficient. seq_aux = (`bool`, *optional*, defaults to True): Whether to compute the auxiliary loss for each individual sample. 
@@ -159,7 +159,7 @@ def __init__( first_k_dense_replace=0, norm_topk_prob=False, scoring_func="softmax", - aux_loss_alpha=0.001, + router_aux_loss_coef=0.001, seq_aux=True, hidden_act="silu", max_position_embeddings=2048, @@ -234,7 +234,7 @@ def __init__( self.first_k_dense_replace = first_k_dense_replace self.norm_topk_prob = norm_topk_prob self.scoring_func = scoring_func - self.aux_loss_alpha = aux_loss_alpha + self.router_aux_loss_coef = router_aux_loss_coef self.seq_aux = seq_aux # for backward compatibility if num_key_value_heads is None: diff --git a/examples/experiments/deepseek_v3_pretrain/config/pretrain_argument.json b/examples/experiments/deepseek_v3_pretrain/config/pretrain_argument.json index fd12017e217..31f6e91e008 100644 --- a/examples/experiments/deepseek_v3_pretrain/config/pretrain_argument.json +++ b/examples/experiments/deepseek_v3_pretrain/config/pretrain_argument.json @@ -17,7 +17,6 @@ "sharding": "stage1", "virtual_pipeline_model_parallel_size": 1, "sequence_parallel": 0, - "use_flash_attention": true, "max_seq_length": 4097, "learning_rate": 2.2e-05, "min_learning_rate": 7.333e-07, @@ -43,8 +42,6 @@ "unified_checkpoint": true, "save_total_limit": 2, "skip_profile_timer": false, - "fuse_rms_norm": true, - "fuse_attention_ffn": true, "apply_rope_fusion": true, "save_sharded_model": false, "load_sharded_model": false, @@ -53,6 +50,6 @@ "offload_optim": true, "reorder_pipeline_priority": true, "num_nextn_predict_layers":1, - "using_fake_gate": false, + "moe_router_force_load_balancing": false, "fa_version": 3 } \ No newline at end of file diff --git a/examples/experiments/deepseek_v3_pretrain/config/pretrain_argument.yaml b/examples/experiments/deepseek_v3_pretrain/config/pretrain_argument.yaml index d4893c8e1ae..fcdbd662f44 100644 --- a/examples/experiments/deepseek_v3_pretrain/config/pretrain_argument.yaml +++ b/examples/experiments/deepseek_v3_pretrain/config/pretrain_argument.yaml @@ -23,7 +23,6 @@ expert_model_parallel_size: 32 sharding: "stage1" virtual_pipeline_model_parallel_size: 1 sequence_parallel: 0 -use_flash_attention: true max_seq_len: 4097 learning_rate: 0.000022 min_lr: 0.00000073333 @@ -48,8 +47,6 @@ distributed_dataloader: 1 unified_checkpoint: true save_total_limit: 2 skip_profile_timer: false -fuse_rms_norm: true -fuse_attention_ffn: true apply_rope_fusion: true save_sharded_model: false load_sharded_model: false @@ -58,7 +55,7 @@ unified_checkpoint_config: "ignore_merge_optimizer" offload_optim: true reorder_pipeline_priority: true num_nextn_predict_layers: 1 -using_fake_gate: false +moe_router_force_load_balancing: false hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 pre_alloc_memory: 61 \ No newline at end of file diff --git a/examples/experiments/deepseek_v3_pretrain/modeling.py b/examples/experiments/deepseek_v3_pretrain/modeling.py index 26ab8c34a0d..7bf7e250261 100644 --- a/examples/experiments/deepseek_v3_pretrain/modeling.py +++ b/examples/experiments/deepseek_v3_pretrain/modeling.py @@ -258,7 +258,6 @@ def __init__(self, config: DeepseekV2FastConfig, hidden_size=None, intermediate_ self.config = config self.hidden_size = config.hidden_size if hidden_size is None else hidden_size self.intermediate_size = config.intermediate_size if intermediate_size is None else intermediate_size - self.fuse_attention_ffn = config.fuse_attention_ffn Linear = FP8Linear if self.config.dsv3_use_fp8_gemm else Linear_ def linear_dtype_gaurd(): @@ -295,20 +294,13 @@ def linear_dtype_gaurd(): has_bias=False, ) else: - if config.fuse_attention_ffn: - 
self.gate_up_fused_proj = Linear(self.hidden_size, self.intermediate_size * 2, bias_attr=False) - else: - self.gate_proj = Linear(self.hidden_size, self.intermediate_size, bias_attr=False) - self.up_proj = Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + self.gate_up_fused_proj = Linear(self.hidden_size, self.intermediate_size * 2, bias_attr=False) self.down_proj = Linear(self.intermediate_size, self.hidden_size, bias_attr=False) self.act_fn = ACT2FN[config.hidden_act] def forward(self, x): - if self.fuse_attention_ffn: - x = swiglu(self.gate_up_fused_proj(x)) - else: - x = swiglu(self.gate_proj(x), self.up_proj(x)) + x = swiglu(self.gate_up_fused_proj(x)) out = self.down_proj(x) return out @@ -370,7 +362,7 @@ def forward(self, hidden_states): # compute gating score if self.using_post_norm_recompute: logits, norm_out = FusedNormGateFunc.apply(hidden_states, self.norm_weight, self.weight, self.norm_eps) - if hasattr(self.config, "using_fake_gate") and self.config.using_fake_gate: + if hasattr(self.config, "moe_router_force_load_balancing") and self.config.moe_router_force_load_balancing: logits = FakeGate.apply( hidden_states, self.weight, @@ -380,7 +372,10 @@ def forward(self, hidden_states): else: with paddle.amp.auto_cast(False): hidden_states = hidden_states.cast(self.weight.dtype) - if hasattr(self.config, "using_fake_gate") and self.config.using_fake_gate: + if ( + hasattr(self.config, "moe_router_force_load_balancing") + and self.config.moe_router_force_load_balancing + ): logits = FakeGate.apply( hidden_states, self.weight, @@ -473,7 +468,7 @@ def __init__(self, config: DeepseekV2FastConfig, norm_weight=None, norm_eps=None p.expert = not self.is_mp_moe logger.info(f"expert no-sync={p.no_sync}-{p.name}") - self.alpha = config.aux_loss_alpha + self.alpha = config.router_aux_loss_coef if config.n_shared_experts is not None: intermediate_size = config.moe_intermediate_size * config.n_shared_experts if self.using_post_norm_recompute: @@ -1658,8 +1653,7 @@ def forward( attention_mask = self._prepare_decoder_attention_mask( attention_mask, (batch_size, seq_length), past_key_values_length, inputs_embeds.dtype ) # [bs, 1, seq_len, seq_len] - if self.config.use_flash_attention: - attention_mask = None if is_casual_mask(attention_mask) else attention_mask + attention_mask = None if is_casual_mask(attention_mask) else attention_mask if self.config.num_nextn_predict_layers > 0: inputs_embeds_extra = inputs_embeds[:, -self.config.num_nextn_predict_layers :, :] # [B, S, D] @@ -1982,7 +1976,7 @@ def __init__(self, config: DeepseekV2FastConfig, hidden_size=None, eps=1e-6, use mark_as_sequence_parallel_parameter(self.weight) def forward(self, hidden_states): - if self.config.fuse_rms_norm: + if True: return RmsNormFunction.apply(hidden_states, self.weight, self.variance_epsilon) with paddle.amp.auto_cast(False): diff --git a/examples/experiments/deepseek_v3_pretrain/run_pretrain.py b/examples/experiments/deepseek_v3_pretrain/run_pretrain.py index 12f7556dd82..3948b5a0033 100644 --- a/examples/experiments/deepseek_v3_pretrain/run_pretrain.py +++ b/examples/experiments/deepseek_v3_pretrain/run_pretrain.py @@ -566,9 +566,7 @@ def main(): # config.using_flex_token = True # config.num_nextn_predict_layers = 1 - # config.using_fake_gate = True - # config.fuse_rms_norm = True - # config.fuse_attention_ffn = True + # config.moe_router_force_load_balancing = True # config.apply_rope_fusion = True # config.token_drop_steps = 0 model = model_class.from_config(config, dtype=dtype) @@ -626,8 
+624,8 @@ def main(): callbacks += [MoeExpertsGradScaleCallback(training_args)] if getattr(config, "topk_method", None) == "noaux_tc": - aux_loss_free_gamma = getattr(config, "aux_loss_free_gamma", 0.001) - callbacks += [MoECorrectionBiasAdjustCallback(aux_loss_free_gamma)] + moe_router_bias_update_rate = getattr(config, "moe_router_bias_update_rate", 0.001) + callbacks += [MoECorrectionBiasAdjustCallback(moe_router_bias_update_rate)] def resume_from_custom_func(model): if training_args.resume_from_huggingface_ckpt: diff --git a/examples/experiments/ernie_pretrain/ernie/model_config.py b/examples/experiments/ernie_pretrain/ernie/model_config.py index 6548a179ba9..48454181fdc 100644 --- a/examples/experiments/ernie_pretrain/ernie/model_config.py +++ b/examples/experiments/ernie_pretrain/ernie/model_config.py @@ -103,7 +103,7 @@ class ModelConfig: neftune: bool = field(default=False, metadata={"help": "Whether to apply NEFT"}) neftune_noise_alpha: float = field(default=5.0, metadata={"help": "NEFT noise alpha"}) flash_mask: bool = field(default=False, metadata={"help": "Whether to use flash_mask in flash attention."}) - attn_impl: str = field(default="flashmask", metadata={"help": "Attention implementation"}) + _attn_implementation: str = field(default="flashmask", metadata={"help": "Attention implementation"}) # long sequence strategy use_long_sequence_strategies: bool = field( diff --git a/examples/experiments/ernie_pretrain/model_configs/ERNIE-4p5-21B-A3B/model_config.json b/examples/experiments/ernie_pretrain/model_configs/ERNIE-4p5-21B-A3B/model_config.json index 8d5affa0756..f36f380e2db 100644 --- a/examples/experiments/ernie_pretrain/model_configs/ERNIE-4p5-21B-A3B/model_config.json +++ b/examples/experiments/ernie_pretrain/model_configs/ERNIE-4p5-21B-A3B/model_config.json @@ -26,7 +26,6 @@ "use_recompute_moe": false, "use_recompute_loss_fn": false, "use_rmsnorm": true, - "fuse_rms_norm": true, "use_bias": false, "use_fast_ln": true, "fuse_attn_ffn": true, diff --git a/examples/experiments/ernie_pretrain/model_configs/ERNIE-4p5-300B-A47B/model_config.json b/examples/experiments/ernie_pretrain/model_configs/ERNIE-4p5-300B-A47B/model_config.json index 720e97bf525..cf0428004ce 100644 --- a/examples/experiments/ernie_pretrain/model_configs/ERNIE-4p5-300B-A47B/model_config.json +++ b/examples/experiments/ernie_pretrain/model_configs/ERNIE-4p5-300B-A47B/model_config.json @@ -26,7 +26,6 @@ "use_recompute_moe": false, "use_recompute_loss_fn": false, "use_rmsnorm": true, - "fuse_rms_norm": true, "use_bias": false, "fuse_attn_ffn": true, "fuse_linear": true, diff --git a/examples/experiments/ernie_pretrain/models/ernie/configuration.py b/examples/experiments/ernie_pretrain/models/ernie/configuration.py index d3b1cc2ba7e..072e602ef97 100644 --- a/examples/experiments/ernie_pretrain/models/ernie/configuration.py +++ b/examples/experiments/ernie_pretrain/models/ernie/configuration.py @@ -88,7 +88,7 @@ def __init__( use_recompute_attn=False, recompute_use_reentrant=False, use_rmsnorm=True, - fuse_rms_norm=False, + fuse_rms_norm=True, fuse_ln=False, pad_token_id=0, bos_token_id=1, diff --git a/examples/experiments/ernie_pretrain/models/ernie/modeling_moe.py b/examples/experiments/ernie_pretrain/models/ernie/modeling_moe.py index aa1287a5420..901ce17ed0a 100644 --- a/examples/experiments/ernie_pretrain/models/ernie/modeling_moe.py +++ b/examples/experiments/ernie_pretrain/models/ernie/modeling_moe.py @@ -795,7 +795,7 @@ def __init__(self, config, layer_idx): self.use_rms_qkv_recompute = 
config.use_rms_qkv_recompute if config.use_rms_qkv_recompute is True: - assert config.use_rmsnorm is True and config.fuse_rms_norm is True + assert config.use_rmsnorm is True assert config.fuse_linear is True and config.use_bias is False assert self.fuse_attn is True @@ -1012,7 +1012,7 @@ def __init__(self, config, layer_idx): if self.use_linear_residual_norm_recompute is True: assert config.hidden_dropout_prob == 0.0 assert config.fuse_linear is True and config.use_bias is False - assert config.use_rmsnorm is True and config.fuse_rms_norm is True + assert config.use_rmsnorm is True self.fused_linear_add_norm = FusedLinearAddNorm(self.hidden_size, config.rms_norm_eps) del self.self_attn.o_proj else: diff --git a/examples/experiments/paddlefleet/run_pretrain.py b/examples/experiments/paddlefleet/run_pretrain.py index 14619bb6d13..d941721a717 100644 --- a/examples/experiments/paddlefleet/run_pretrain.py +++ b/examples/experiments/paddlefleet/run_pretrain.py @@ -641,8 +641,8 @@ def main(): callbacks += [MoeExpertsGradScaleCallback(training_args)] if getattr(config, "topk_method", None) == "noaux_tc": - aux_loss_free_gamma = getattr(config, "aux_loss_free_gamma", 0.001) - callbacks += [MoECorrectionBiasAdjustCallback(aux_loss_free_gamma)] + moe_router_bias_update_rate = getattr(config, "moe_router_bias_update_rate", 0.001) + callbacks += [MoECorrectionBiasAdjustCallback(moe_router_bias_update_rate)] def resume_from_custom_func(model): if training_args.resume_from_huggingface_ckpt: diff --git a/paddleformers/cli/hparams/model_args.py b/paddleformers/cli/hparams/model_args.py index 90aef9f4b6d..130c48e5c57 100644 --- a/paddleformers/cli/hparams/model_args.py +++ b/paddleformers/cli/hparams/model_args.py @@ -122,7 +122,7 @@ class ModelArguments: default=False, metadata={"help": "GPT3 model, use fast layernorm"}, ) - attn_impl: str = field(default="flashmask", metadata={"help": "Attention implementation"}) + _attn_implementation: str = field(default="flashmask", metadata={"help": "Attention implementation"}) fuse_gate_detach_matmul: bool = field( default=True, metadata={"help": "Whether to use the fused gate-detach matmul implementation."}, diff --git a/paddleformers/cli/train/deepseek_v3_pretrain/configuration.py b/paddleformers/cli/train/deepseek_v3_pretrain/configuration.py index 01af544be36..79c392e6a35 100644 --- a/paddleformers/cli/train/deepseek_v3_pretrain/configuration.py +++ b/paddleformers/cli/train/deepseek_v3_pretrain/configuration.py @@ -68,7 +68,7 @@ class DeepseekV2FastConfig(PretrainedConfig): Whether to normalize the weights of the routed experts. scoring_func (`str`, *optional*, defaults to 'softmax'): Method of computing expert weights. - aux_loss_alpha (`float`, *optional*, defaults to 0.001): + router_aux_loss_coef (`float`, *optional*, defaults to 0.001): Auxiliary loss weight coefficient. seq_aux = (`bool`, *optional*, defaults to True): Whether to compute the auxiliary loss for each individual sample. 
@@ -158,7 +158,7 @@ def __init__( first_k_dense_replace=0, norm_topk_prob=False, scoring_func="softmax", - aux_loss_alpha=0.001, + router_aux_loss_coef=0.001, seq_aux=True, hidden_act="silu", max_position_embeddings=2048, @@ -233,7 +233,7 @@ def __init__( self.first_k_dense_replace = first_k_dense_replace self.norm_topk_prob = norm_topk_prob self.scoring_func = scoring_func - self.aux_loss_alpha = aux_loss_alpha + self.router_aux_loss_coef = router_aux_loss_coef self.seq_aux = seq_aux # for backward compatibility if num_key_value_heads is None: diff --git a/paddleformers/cli/train/deepseek_v3_pretrain/modeling.py b/paddleformers/cli/train/deepseek_v3_pretrain/modeling.py index bb64e4098da..759562d5bfe 100644 --- a/paddleformers/cli/train/deepseek_v3_pretrain/modeling.py +++ b/paddleformers/cli/train/deepseek_v3_pretrain/modeling.py @@ -259,7 +259,6 @@ def __init__(self, config: DeepseekV2FastConfig, hidden_size=None, intermediate_ self.config = config self.hidden_size = config.hidden_size if hidden_size is None else hidden_size self.intermediate_size = config.intermediate_size if intermediate_size is None else intermediate_size - self.fuse_attention_ffn = config.fuse_attention_ffn Linear = FP8Linear if self.config.dsv3_use_fp8_gemm else Linear_ def linear_dtype_gaurd(): @@ -296,20 +295,13 @@ def linear_dtype_gaurd(): has_bias=False, ) else: - if config.fuse_attention_ffn: - self.gate_up_fused_proj = Linear(self.hidden_size, self.intermediate_size * 2, bias_attr=False) - else: - self.gate_proj = Linear(self.hidden_size, self.intermediate_size, bias_attr=False) - self.up_proj = Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + self.gate_up_fused_proj = Linear(self.hidden_size, self.intermediate_size * 2, bias_attr=False) self.down_proj = Linear(self.intermediate_size, self.hidden_size, bias_attr=False) self.act_fn = ACT2FN[config.hidden_act] def forward(self, x): - if self.fuse_attention_ffn: - x = swiglu(self.gate_up_fused_proj(x)) - else: - x = swiglu(self.gate_proj(x), self.up_proj(x)) + x = swiglu(self.gate_up_fused_proj(x)) out = self.down_proj(x) return out @@ -371,7 +363,7 @@ def forward(self, hidden_states): # compute gating score if self.using_post_norm_recompute: logits, norm_out = FusedNormGateFunc.apply(hidden_states, self.norm_weight, self.weight, self.norm_eps) - if hasattr(self.config, "using_fake_gate") and self.config.using_fake_gate: + if hasattr(self.config, "moe_router_force_load_balancing") and self.config.moe_router_force_load_balancing: logits = FakeGate.apply( hidden_states, self.weight, @@ -381,7 +373,10 @@ def forward(self, hidden_states): else: with paddle.amp.auto_cast(False): hidden_states = hidden_states.cast(self.weight.dtype) - if hasattr(self.config, "using_fake_gate") and self.config.using_fake_gate: + if ( + hasattr(self.config, "moe_router_force_load_balancing") + and self.config.moe_router_force_load_balancing + ): logits = FakeGate.apply( hidden_states, self.weight, @@ -474,7 +469,7 @@ def __init__(self, config: DeepseekV2FastConfig, norm_weight=None, norm_eps=None p.expert = not self.is_mp_moe logger.info(f"expert no-sync={p.no_sync}-{p.name}") - self.alpha = config.aux_loss_alpha + self.alpha = config.router_aux_loss_coef if config.n_shared_experts is not None: intermediate_size = config.moe_intermediate_size * config.n_shared_experts if self.using_post_norm_recompute: @@ -1659,8 +1654,7 @@ def forward( attention_mask = self._prepare_decoder_attention_mask( attention_mask, (batch_size, seq_length), past_key_values_length, 
inputs_embeds.dtype ) # [bs, 1, seq_len, seq_len] - if self.config.use_flash_attention: - attention_mask = None if is_casual_mask(attention_mask) else attention_mask + attention_mask = None if is_casual_mask(attention_mask) else attention_mask if self.config.num_nextn_predict_layers > 0: inputs_embeds_extra = inputs_embeds[:, -self.config.num_nextn_predict_layers :, :] # [B, S, D] @@ -1983,7 +1977,7 @@ def __init__(self, config: DeepseekV2FastConfig, hidden_size=None, eps=1e-6, use mark_as_sequence_parallel_parameter(self.weight) def forward(self, hidden_states): - if self.config.fuse_rms_norm: + if True: return RmsNormFunction.apply(hidden_states, self.weight, self.variance_epsilon) with paddle.amp.auto_cast(False): diff --git a/paddleformers/cli/train/deepseek_v3_pretrain/workflow.py b/paddleformers/cli/train/deepseek_v3_pretrain/workflow.py index 45e41aa0215..a9d69077e70 100644 --- a/paddleformers/cli/train/deepseek_v3_pretrain/workflow.py +++ b/paddleformers/cli/train/deepseek_v3_pretrain/workflow.py @@ -496,9 +496,7 @@ def run_dsv3_pretrain(model_args, data_args, generating_args, training_args): # config.using_flex_token = True # config.num_nextn_predict_layers = 1 - # config.using_fake_gate = True - # config.fuse_rms_norm = True - # config.fuse_attention_ffn = True + # config.moe_router_force_load_balancing = True # config.apply_rope_fusion = True # config.token_drop_steps = 0 model = model_class.from_config(config, dtype=dtype) @@ -556,8 +554,8 @@ def run_dsv3_pretrain(model_args, data_args, generating_args, training_args): callbacks += [MoeExpertsGradScaleCallback(training_args)] if getattr(config, "topk_method", None) == "noaux_tc": - aux_loss_free_gamma = getattr(config, "aux_loss_free_gamma", 0.001) - callbacks += [MoECorrectionBiasAdjustCallback(aux_loss_free_gamma)] + moe_router_bias_update_rate = getattr(config, "moe_router_bias_update_rate", 0.001) + callbacks += [MoECorrectionBiasAdjustCallback(moe_router_bias_update_rate)] def resume_from_custom_func(model): if training_args.resume_from_huggingface_ckpt: diff --git a/paddleformers/cli/train/dpo/dpo_argument.py b/paddleformers/cli/train/dpo/dpo_argument.py index 71bc3b42458..a02ad6230ef 100644 --- a/paddleformers/cli/train/dpo/dpo_argument.py +++ b/paddleformers/cli/train/dpo/dpo_argument.py @@ -155,4 +155,4 @@ class DPOModelArgument: use_quick_lora: bool = field(default=True, metadata={"help": "quick lora"}) # Attention - attn_impl: str = field(default="flashmask", metadata={"help": "Attention implementation"}) + _attn_implementation: str = field(default="flashmask", metadata={"help": "Attention implementation"}) diff --git a/paddleformers/cli/train/dpo/workflow.py b/paddleformers/cli/train/dpo/workflow.py index 64c1002c26a..86ba1aabca4 100644 --- a/paddleformers/cli/train/dpo/workflow.py +++ b/paddleformers/cli/train/dpo/workflow.py @@ -71,8 +71,10 @@ def run_dpo( set_seed(training_args.seed) avaible_attn_impl = AttentionInterface._global_mapping.keys() - if model_args.attn_impl not in avaible_attn_impl: - raise ValueError(f"Invalid attn_impl: {model_args.attn_impl}, available attn_impl: {avaible_attn_impl}") + if model_args._attn_implementation not in avaible_attn_impl: + raise ValueError( + f"Invalid _attn_implementation: {model_args._attn_implementation}, available _attn_implementation: {avaible_attn_impl}" + ) if training_args.loss_type == "orpo": training_args.reference_free = True @@ -148,7 +150,7 @@ def run_dpo( model_args.model_name_or_path, dtype=dtype, ) - model_config._attn_implementation = model_args.attn_impl 
+ model_config._attn_implementation = model_args._attn_implementation model_config.pp_seg_method = model_args.pp_seg_method model_config.max_sequence_length = data_args.max_seq_len model_config.seq_length = data_args.max_seq_len @@ -164,7 +166,7 @@ def run_dpo( ref_model_config.pp_seg_method = model_args.pp_seg_method ref_model_config.max_sequence_length = data_args.max_seq_len ref_model_config.seq_length = data_args.max_seq_len - ref_model_config._attn_implementation = model_args.attn_impl + ref_model_config._attn_implementation = model_args._attn_implementation LlmMetaConfig.set_llm_config(ref_model_config, training_args) diff --git a/paddleformers/cli/train/ernie_pretrain/model_config.py b/paddleformers/cli/train/ernie_pretrain/model_config.py index 6548a179ba9..48454181fdc 100644 --- a/paddleformers/cli/train/ernie_pretrain/model_config.py +++ b/paddleformers/cli/train/ernie_pretrain/model_config.py @@ -103,7 +103,7 @@ class ModelConfig: neftune: bool = field(default=False, metadata={"help": "Whether to apply NEFT"}) neftune_noise_alpha: float = field(default=5.0, metadata={"help": "NEFT noise alpha"}) flash_mask: bool = field(default=False, metadata={"help": "Whether to use flash_mask in flash attention."}) - attn_impl: str = field(default="flashmask", metadata={"help": "Attention implementation"}) + _attn_implementation: str = field(default="flashmask", metadata={"help": "Attention implementation"}) # long sequence strategy use_long_sequence_strategies: bool = field( diff --git a/paddleformers/cli/train/ernie_pretrain/models/ernie/configuration.py b/paddleformers/cli/train/ernie_pretrain/models/ernie/configuration.py index 31460565972..a97bf124cdc 100644 --- a/paddleformers/cli/train/ernie_pretrain/models/ernie/configuration.py +++ b/paddleformers/cli/train/ernie_pretrain/models/ernie/configuration.py @@ -88,7 +88,7 @@ def __init__( use_recompute_attn=False, recompute_use_reentrant=False, use_rmsnorm=True, - fuse_rms_norm=False, + fuse_rms_norm=True, fuse_ln=False, pad_token_id=0, bos_token_id=1, diff --git a/paddleformers/cli/train/sft/workflow.py b/paddleformers/cli/train/sft/workflow.py index b3e8e1d2599..73342c2bb28 100644 --- a/paddleformers/cli/train/sft/workflow.py +++ b/paddleformers/cli/train/sft/workflow.py @@ -252,20 +252,22 @@ def run_sft( model_config.ignore_index = -100 avaible_attn_impl = AttentionInterface._global_mapping.keys() - if model_args.attn_impl not in avaible_attn_impl: - raise ValueError(f"Invalid attn_impl: {model_args.attn_impl}, available attn_impl: {avaible_attn_impl}") + if model_args._attn_implementation not in avaible_attn_impl: + raise ValueError( + f"Invalid _attn_implementation: {model_args._attn_implementation}, available _attn_implementation: {avaible_attn_impl}" + ) model_config.pp_seg_method = model_args.pp_seg_method model_config.seq_length = data_args.max_seq_len model_config.max_sequence_length = data_args.max_seq_len - model_config._attn_implementation = model_args.attn_impl + model_config._attn_implementation = model_args._attn_implementation model_config.is_lora = model_args.lora # Sync arguments to MLLM sub_config if getattr(model_config, "text_config", None) is not None: model_config.text_config.max_sequence_length = data_args.max_seq_len if getattr(model_config, "vision_config", None) is not None: - model_config.vision_config._attn_implementation = model_args.attn_impl + model_config.vision_config._attn_implementation = model_args._attn_implementation model_config.vision_config.recompute_granularity = 
model_config.recompute_granularity model_config.vision_config.recompute_method = model_config.recompute_method model_config.vision_config.recompute_num_layers = model_config.recompute_num_layers diff --git a/paddleformers/nn/moe_deepep/modular_moe_layer.py b/paddleformers/nn/moe_deepep/modular_moe_layer.py index ce463166f25..156559e9d35 100644 --- a/paddleformers/nn/moe_deepep/modular_moe_layer.py +++ b/paddleformers/nn/moe_deepep/modular_moe_layer.py @@ -69,12 +69,12 @@ def __init__( self.sequence_parallel = pretrained_config.get("sequence_parallel", False) self.tensor_model_parallel_size = pretrained_config.get("tensor_model_parallel_size", 1) self.seq_length = pretrained_config.get("seq_length", pretrained_config.get("max_seq_len", 1024)) - self.fuse_up_gate = pretrained_config.get("fuse_attention_ffn", False) - self.ep_communication_type = pretrained_config.get("ep_communication_type", "deepep") + self.fuse_up_gate = True + self.moe_token_dispatcher_type = pretrained_config.get("moe_token_dispatcher_type", "deepep") self.n_group = pretrained_config.get("n_group", 1) self.topk_group = pretrained_config.get("topk_group", 1) self.routed_scaling_factor = pretrained_config.get("routed_scaling_factor", 1.0) - self.aux_loss_alpha = pretrained_config.get("aux_loss_alpha", 0.0) + self.router_aux_loss_coef = pretrained_config.get("router_aux_loss_coef", 0.0) self.moe_subbatch_token_num_before_dispatch = pretrained_config.get( "moe_subbatch_token_num_before_dispatch", -1 ) @@ -167,13 +167,13 @@ def __init__( self.shared_expert = self.expert_class(**shared_expert_args) self.shared_expert_gate = GeneralLinear.create(self.hidden_size, 1, has_bias=False, linear_type="default") - if self.ep_communication_type == "deepep": + if self.moe_token_dispatcher_type == "deepep": self.communication = DeepEPMoECommunication() - elif self.ep_communication_type == "alltoall": + elif self.moe_token_dispatcher_type == "alltoall": self.communication = AllToAllMoECommunication() else: raise ValueError( - f"Unsupported communication type: {self.ep_communication_type}, please choose from ['deepep', 'alltoall']" + f"Unsupported communication type: {self.moe_token_dispatcher_type}, please choose from ['deepep', 'alltoall']" ) if hasattr(dist, "fleet") and dist.is_initialized() and self.expert_model_parallel_size > 1: @@ -264,8 +264,8 @@ def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: reshaped_input = hidden_states output = self._forward_traditional_moe(reshaped_input, topk_indices, topk_weights) - if self.training and self.aux_loss_alpha > 0.0: - aux_loss = aux_loss * self.aux_loss_alpha + if self.training and self.router_aux_loss_coef > 0.0: + aux_loss = aux_loss * self.router_aux_loss_coef output = AddAuxiliaryLoss.apply(output, aux_loss) if self.shared_experts is not None: diff --git a/paddleformers/nn/norm.py b/paddleformers/nn/norm.py index 27606c5fa6f..72452534908 100644 --- a/paddleformers/nn/norm.py +++ b/paddleformers/nn/norm.py @@ -17,11 +17,14 @@ from paddle.distributed.fleet.utils.sequence_parallel_utils import ( mark_as_sequence_parallel_parameter, ) -from paddle.incubate.nn.functional import fused_rms_norm_ext +# from ..cli.utils.process import detect_device from ..generation.configuration_utils import PretrainedConfig from .general import GeneralInterface +# from paddle.incubate.nn.functional import fused_rms_norm_ext + + __all__ = ["Norm"] @@ -65,8 +68,9 @@ def __init__(self, config: PretrainedConfig, hidden_size=None, norm_eps=None, in self.enable_sequence_parallel() def forward(self, 
hidden_states): - if self.config.get("fuse_rms_norm", False): - return fused_rms_norm_ext(hidden_states, self.weight, self.variance_epsilon)[0].astype(self.weight.dtype) + # current_device = detect_device() + # if self.config.get("fuse_rms_norm", False) and current_device != "iluvatar_gpu": + # return fused_rms_norm_ext(hidden_states, self.weight, self.variance_epsilon)[0].astype(self.weight.dtype) if paddle.in_dynamic_mode(): with paddle.amp.auto_cast(False): diff --git a/paddleformers/nn/pp_model.py b/paddleformers/nn/pp_model.py index fd7d8a12057..ad8d4a09a27 100644 --- a/paddleformers/nn/pp_model.py +++ b/paddleformers/nn/pp_model.py @@ -122,7 +122,7 @@ def get_pp_vp_split_layers(config, skip_recompute_num=-1): config (Config): Model configuration object containing: - num_hidden_layers (int): Total number of transformer layers - virtual_pipeline_model_parallel_size (int): Virtual pipeline parallelism degree - - add_tail_layers (int): Additional tail layers to append + - num_empty_layers_add_in_tail (int): Additional tail layers to append skip_recompute_num (int): Number of layers per virtual pipeline stage to exclude from recomputation. Defaults to -1 (auto-configure). Returns: @@ -139,7 +139,7 @@ def get_pp_vp_split_layers(config, skip_recompute_num=-1): assert pp_size > 1, ( "Only support pipeline parallel, " f"pp_size must be greater than 1, but got pp_size: {pp_size}" ) - layer_num = config.num_hidden_layers + config.add_tail_layers + layer_num = config.num_hidden_layers + config.num_empty_layers_add_in_tail if skip_recompute_num == -1: # select all layers to skip recompute @@ -614,7 +614,7 @@ def __init__(self, config: PretrainedConfig, **kwargs): LayerDesc(MTPLayerPipeCls, config=config, layer_idx=config.num_hidden_layers + i), f"model.layers.{config.num_hidden_layers + i}", ) - for i in range(config.add_tail_layers): + for i in range(config.num_empty_layers_add_in_tail): self.add_sequential_layer( LayerDesc( EmptyLayer, @@ -651,7 +651,9 @@ def __init__(self, config: PretrainedConfig, **kwargs): if ( seg_method == "layer:DecoderLayer|EmptyLayer" - and (config.num_hidden_layers + config.add_tail_layers) % get_hcg().topology().get_dim_size("pipe") != 0 + and (config.num_hidden_layers + config.num_empty_layers_add_in_tail) + % get_hcg().topology().get_dim_size("pipe") + != 0 ): seg_method = "uniform" logger.info(f"using recompute_interval={recompute_interval}, seg_method={seg_method}") diff --git a/paddleformers/trainer/training_args.py b/paddleformers/trainer/training_args.py index 595283d90bd..b5cd819dab0 100644 --- a/paddleformers/trainer/training_args.py +++ b/paddleformers/trainer/training_args.py @@ -1092,7 +1092,7 @@ class TrainingArguments: default=False, metadata={"help": "Enable MoE (Mixture of Experts) expert parallel training"}, ) - aux_loss_alpha: Optional[float] = field( + router_aux_loss_coef: Optional[float] = field( default=0.0001, metadata={"help": "MoE (Mixture of Experts) Auxiliary loss weight coefficient"}, ) diff --git a/paddleformers/transformers/attention_utils.py b/paddleformers/transformers/attention_utils.py index cf8ced7b334..6d79f9a08ae 100755 --- a/paddleformers/transformers/attention_utils.py +++ b/paddleformers/transformers/attention_utils.py @@ -553,7 +553,7 @@ def __init__( self.v_proj = Linear3D(embed_dim, num_heads, self.head_dim, weight_attr, bias_attr=bias_attr) self.out_proj = nn.Linear(embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) - self.attn_impl = AttentionRegistry.cls_dict[attention_type]( + self._attn_implementation = 
AttentionRegistry.cls_dict[attention_type]( num_heads, block_size, window_size, num_global_blocks, num_rand_blocks, seed ) @@ -603,7 +603,9 @@ def forward( else: q, k, v, cache = self._prepare_qkv(query, key, value, cache) - out = self.attn_impl(q, k, v, self.head_dim, attn_mask, rand_mask_idx, query_mask, key_mask, self.dropout) + out = self._attn_implementation( + q, k, v, self.head_dim, attn_mask, rand_mask_idx, query_mask, key_mask, self.dropout + ) # combine heads out = paddle.transpose(out, perm=[0, 2, 1, 3]) out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) diff --git a/paddleformers/transformers/configuration_utils.py b/paddleformers/transformers/configuration_utils.py index 50a19c566e2..b83b3c40393 100644 --- a/paddleformers/transformers/configuration_utils.py +++ b/paddleformers/transformers/configuration_utils.py @@ -229,13 +229,9 @@ def llmmetaclass(cls): class LlmMetaConfig: op_fusion_attributes = [ # name, type, default_value, comment - ("use_flash_attention", bool, False, "Only used in `ernie45_vl` and `deepseek_v3_pretrain`."), - ("fuse_rms_norm", bool, False, "Whether to fuse RMSNorm for efficiency"), ("use_fused_linear_cross_entropy", bool, False, "use fused `linear + cross_entropy` fuse op."), ("apply_rope_fusion", bool, False, "Whether to fuse RoPE operation"), ("fuse_swiglu", bool, False, "Whether to fuse SwiGLU operations"), - ("fuse_attention_qkv", bool, False, "Whether to fuse Attention QKV operations"), - ("fuse_attention_ffn", bool, False, "Whether to fuse Attention FFN operations"), ] hybrid_parallel_attributes = [ @@ -254,7 +250,7 @@ class LlmMetaConfig: ("context_parallel_size", int, 1, "context_parallel_size"), # pp refine recompute ("no_recompute_layers", Optional[List[int]], None, "no_recompute_layers"), - ("add_tail_layers", int, 0, "Additional layers to append at the end"), + ("num_empty_layers_add_in_tail", int, 0, "Additional layers to append at the end"), # sep_parallel ("sep_parallel_size", int, 1, "sep_parallel_size"), ("context_parallel_size", int, 1, "context_parallel_size"), @@ -302,8 +298,8 @@ class LlmMetaConfig: 0, "The number of tokens in each subbatch for MoE model processing.", ), - ("using_fake_gate", bool, False, "Whether to fake gate."), - ("ep_communication_type", str, "deepep", 'Communication type used by MoE module "deepep" or "alltoall". '), + ("moe_router_force_load_balancing", bool, False, "Whether to force load balancing in the MoE router (formerly `using_fake_gate`)."), + ("moe_token_dispatcher_type", str, "deepep", 'Communication type used by MoE module "deepep" or "alltoall". '), ("use_unified_moe", bool, False, "Whether to use unified moe."), ( "moe_deepep_num_sms", @@ -748,7 +744,7 @@ class PretrainedConfig: `"single_label_classification"` or `"multi_label_classification"`. moe_subbatch_token_num_before_dispatch (`int`, *optional*, defaults to 0): The number of tokens in a subbatch for MoE. - ep_communication_type (`str`, *optional*, defaults to `deepep`): + moe_token_dispatcher_type (`str`, *optional*, defaults to `deepep`): Communication type for expert parallel. Can be one of `deepep`, `alltoall`. use_unified_moe (`bool`, *optional*, defaults to `False`): Whether to use unified MoE.
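# --- Illustrative sketch (not part of this patch): mapping the renamed configuration keys. ---
# The hunks above and below rename several options; existing YAML/JSON configs written
# against the old names can be translated with a small helper like this one.
# `migrate_config`, `_RENAMED_KEYS`, and `_DROPPED_KEYS` are hypothetical names used only
# for illustration, not APIs from this repository.
_RENAMED_KEYS = {
    "attn_impl": "_attn_implementation",
    "aux_loss_alpha": "router_aux_loss_coef",
    "aux_loss_free_gamma": "moe_router_bias_update_rate",
    "using_fake_gate": "moe_router_force_load_balancing",
    "ep_communication_type": "moe_token_dispatcher_type",
    "add_tail_layers": "num_empty_layers_add_in_tail",
}
# Keys removed outright by this patch; their behavior is now hard-wired in the code.
_DROPPED_KEYS = {"use_flash_attention", "fuse_rms_norm", "fuse_attention_qkv", "fuse_attention_ffn"}

def migrate_config(old: dict) -> dict:
    """Return a copy of `old` with renamed keys translated and dropped keys removed."""
    new = {}
    for key, value in old.items():
        if key in _DROPPED_KEYS:
            continue
        new[_RENAMED_KEYS.get(key, key)] = value
    return new

# Example: migrate_config({"attn_impl": "flashmask", "aux_loss_alpha": 1e-4})
# returns {"_attn_implementation": "flashmask", "router_aux_loss_coef": 1e-4}.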
@@ -829,8 +825,6 @@ def __init__(self, **kwargs): llm_meta = LlmMetaConfig._get_init() self._unsavable_keys.update(LlmMetaConfig._get_unsavable_keys()) self._unsavable_keys.remove("tensor_model_parallel_size") - self._unsavable_keys.remove("fuse_attention_qkv") - self._unsavable_keys.remove("fuse_attention_ffn") self._unsavable_keys.add("_attn_implementation") kwargs = set_expected_keys(self, llm_meta, kwargs) @@ -853,10 +847,6 @@ def __init__(self, **kwargs): self.sep_parallel_size = 1 self.context_parallel_size = 1 - # for transformers fuse - self.fuse_attention_qkv = kwargs.pop("fuse_attention_qkv", False) - self.fuse_attention_ffn = kwargs.pop("fuse_attention_ffn", False) - # for general components self._attn_implementation = kwargs.pop("_attn_implementation", "eager") @@ -905,9 +895,9 @@ def __init__(self, **kwargs): self.dpo_config = kwargs.pop("dpo_config", None) self.kto_config = kwargs.pop("kto_config", None) - self.ep_communication_type = kwargs.pop("ep_communication_type", "deepep") + self.moe_token_dispatcher_type = kwargs.pop("moe_token_dispatcher_type", "deepep") self.use_unified_moe = kwargs.pop("use_unified_moe", False) - self.using_fake_gate = kwargs.pop("using_fake_gate", False) + self.moe_router_force_load_balancing = kwargs.pop("moe_router_force_load_balancing", False) # Tokenizer arguments TODO: eventually tokenizer and models should share the same config self.tokenizer_class = kwargs.pop("tokenizer_class", None) diff --git a/paddleformers/transformers/deepseek_v3/configuration.py b/paddleformers/transformers/deepseek_v3/configuration.py index d61c7c01782..fd0bfd0ff11 100644 --- a/paddleformers/transformers/deepseek_v3/configuration.py +++ b/paddleformers/transformers/deepseek_v3/configuration.py @@ -70,7 +70,7 @@ class DeepseekV3Config(PretrainedConfig): Whether to normalize the weights of the routed experts. scoring_func (`str`, *optional*, defaults to 'softmax'): Method of computing expert weights. - aux_loss_alpha (`float`, *optional*, defaults to 0.001): + router_aux_loss_coef (`float`, *optional*, defaults to 0.001): Auxiliary loss weight coefficient. seq_aux = (`bool`, *optional*, defaults to True): Whether to compute the auxiliary loss for each individual sample. 
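# --- Illustrative sketch (not part of this patch): how `router_aux_loss_coef` enters training. ---
# In the MoE layer hunk above, the auxiliary loss is multiplied by the renamed coefficient
# before being attached to the layer output. The balance term below is the generic top-k
# load-balancing formulation, shown only for intuition; the exact computation in this
# repository may differ, and `scaled_aux_loss` is a hypothetical helper name.
def scaled_aux_loss(expert_fraction, router_prob_mean, router_aux_loss_coef=0.0001):
    """expert_fraction[i]: fraction of tokens dispatched to expert i.
    router_prob_mean[i]: mean router probability assigned to expert i."""
    num_experts = len(expert_fraction)
    balance = num_experts * sum(f * p for f, p in zip(expert_fraction, router_prob_mean))
    return router_aux_loss_coef * balance

# With perfectly uniform routing over 4 experts the loss collapses to the coefficient:
# scaled_aux_loss([0.25] * 4, [0.25] * 4) == 0.0001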
@@ -161,7 +161,7 @@ def __init__( first_k_dense_replace=0, norm_topk_prob=False, scoring_func="softmax", - aux_loss_alpha=0.0001, + router_aux_loss_coef=0.0001, seq_aux=True, hidden_act="silu", max_position_embeddings=2048, @@ -208,7 +208,7 @@ def __init__( self.first_k_dense_replace = first_k_dense_replace self.norm_topk_prob = norm_topk_prob self.scoring_func = scoring_func - self.aux_loss_alpha = aux_loss_alpha + self.router_aux_loss_coef = router_aux_loss_coef self.seq_aux = seq_aux # for backward compatibility if num_key_value_heads is None: diff --git a/paddleformers/transformers/deepseek_v3/modeling.py b/paddleformers/transformers/deepseek_v3/modeling.py index cb4f68fdb30..8d5d3274731 100644 --- a/paddleformers/transformers/deepseek_v3/modeling.py +++ b/paddleformers/transformers/deepseek_v3/modeling.py @@ -300,7 +300,7 @@ def forward(self, hidden_states): with paddle.amp.auto_cast(False): hidden_states = hidden_states.cast(self.weight.dtype) - if hasattr(self.config, "using_fake_gate") and self.config.using_fake_gate: + if hasattr(self.config, "moe_router_force_load_balancing") and self.config.moe_router_force_load_balancing: logits = FakeGate.apply(hidden_states, self.weight) else: logits = F.linear(hidden_states, self.weight, None) @@ -488,7 +488,7 @@ def __init__(self, config: DeepseekV3Config): if self.is_mp_moe or self.is_ep_moe: p.is_distributed = True - self.alpha = config.aux_loss_alpha + self.alpha = config.router_aux_loss_coef if config.n_shared_experts is not None: intermediate_size = config.moe_intermediate_size * config.n_shared_experts self.shared_experts = DeepseekV3MLP(config=config, intermediate_size=intermediate_size) @@ -1129,7 +1129,6 @@ def _gen_aoa_config(cls, config: DeepseekV3Config): f"model.layers.$LAYER_ID.mlp.shared_experts.down_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.shared_experts.down_proj.weight", ] } - if config.q_lora_rank: aoa_config["aoa_statements"] += [ f"model.layers.$LAYER_ID.self_attn.q_{x}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.q_{x}_proj.weight" @@ -1138,13 +1137,11 @@ def _gen_aoa_config(cls, config: DeepseekV3Config): aoa_config["aoa_statements"] += [ f"model.layers.$LAYER_ID.self_attn.q_a_layernorm.weight -> {model_prefix}layers.$LAYER_ID.self_attn.q_a_layernorm.weight" ] - aoa_config["aoa_statements"] += [ f"model.layers.$LAYER_ID.self_attn.kv_a_proj_with_mqa.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.kv_a_proj_with_mqa.weight", f"model.layers.$LAYER_ID.self_attn.kv_b_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.kv_b_proj.weight", f"model.layers.$LAYER_ID.self_attn.kv_a_layernorm.weight -> {model_prefix}layers.$LAYER_ID.self_attn.kv_a_layernorm.weight", ] - if config.attention_bias: aoa_config["aoa_statements"] += [ f"model.layers.$LAYER_ID.self_attn.q_a_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.q_a_proj.bias", @@ -1152,43 +1149,30 @@ def _gen_aoa_config(cls, config: DeepseekV3Config): ] # attention qkv - if not config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, 
model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", - f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - ] + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" + for x in ("q", "k", "v") + ] + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.self_attn.{x}_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias" + for x in ("q", "k", "v") + ] # FFN - if not config.fuse_attention_ffn: - aoa_config["aoa_statements"] += ( - [ - f"model.layers.$LAYER_ID.mlp.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.{p}_proj.weight" - for p in ("gate", "up") - ] - + [ - f"model.layers.$LAYER_ID.mlp.shared_experts.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.shared_experts.{p}_proj.weight" - for p in ("gate", "up") - ] - + [ - f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{p}_proj.weight" - for p in ("gate", "up") - ] - ) - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", - f"model.layers.$LAYER_ID.mlp.shared_experts.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.shared_experts.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.shared_experts.up_gate_proj.weight, fused_ffn", - f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight, fused_ffn", + aoa_config["aoa_statements"] += ( + [ + f"model.layers.$LAYER_ID.mlp.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.{p}_proj.weight" + for p in ("gate", "up") + ] + + [ + f"model.layers.$LAYER_ID.mlp.shared_experts.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.shared_experts.{p}_proj.weight" + for p in ("gate", "up") ] + + [ + f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{p}_proj.weight" + for p in ("gate", "up") + ] + ) return aoa_config @@ -1210,7 +1194,6 @@ def _gen_inv_aoa_config(cls, config: DeepseekV3Config): f"{model_prefix}layers.$LAYER_ID.post_attention_layernorm.weight -> model.layers.$LAYER_ID.post_attention_layernorm.weight", f"{model_prefix}layers.$LAYER_ID.mlp.gate.e_score_correction_bias -> model.layers.$LAYER_ID.mlp.gate.e_score_correction_bias", ] - if config.q_lora_rank: aoa_statements += [ f"{model_prefix}layers.$LAYER_ID.self_attn.q_{x}_proj.weight^T -> model.layers.$LAYER_ID.self_attn.q_{x}_proj.weight" @@ -1219,82 +1202,41 @@ def _gen_inv_aoa_config(cls, config: DeepseekV3Config): aoa_statements += [ f"{model_prefix}layers.$LAYER_ID.self_attn.q_a_layernorm.weight -> model.layers.$LAYER_ID.self_attn.q_a_layernorm.weight" ] - aoa_statements += [ f"{model_prefix}layers.$LAYER_ID.self_attn.kv_a_proj_with_mqa.weight^T -> model.layers.$LAYER_ID.self_attn.kv_a_proj_with_mqa.weight", 
f"{model_prefix}layers.$LAYER_ID.self_attn.kv_b_proj.weight^T -> model.layers.$LAYER_ID.self_attn.kv_b_proj.weight", f"{model_prefix}layers.$LAYER_ID.self_attn.kv_a_layernorm.weight -> model.layers.$LAYER_ID.self_attn.kv_a_layernorm.weight", ] - if config.attention_bias: aoa_statements += [ f"{model_prefix}layers.$LAYER_ID.self_attn.q_a_proj.bias -> model.layers.$LAYER_ID.self_attn.q_a_proj.bias", f"{model_prefix}layers.$LAYER_ID.self_attn.kv_a_proj_with_mqa.bias -> model.layers.$LAYER_ID.self_attn.kv_a_proj_with_mqa.bias", ] - if not config.fuse_attention_qkv: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> model.layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias -> model.layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> model.layers.$LAYER_ID.self_attn.{x}_proj.weight" + for x in ("q", "k", "v") + ] + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias -> model.layers.$LAYER_ID.self_attn.{x}_proj.bias" + for x in ("q", "k", "v") + ] + + aoa_statements += ( + [ + f"{model_prefix}layers.$LAYER_ID.mlp.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.{y}_proj.weight" + for y in ("gate", "up") ] - else: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}, axis = 0", + + [ + f"{model_prefix}layers.$LAYER_ID.mlp.shared_experts.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.shared_experts.{y}_proj.weight" + for y in ("gate", "up") ] - aoa_statements += [ - f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" - for layer_id in range(config.num_hidden_layers) - for x in ("q", "k", "v") + + [ + f"{model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{y}_proj.weight" + for y in ("gate", "up") ] + ) - if not config.fuse_attention_ffn: - aoa_statements += ( - [ - f"{model_prefix}layers.$LAYER_ID.mlp.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.{y}_proj.weight" - for y in ("gate", "up") - ] - + [ - f"{model_prefix}layers.$LAYER_ID.mlp.shared_experts.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.shared_experts.{y}_proj.weight" - for y in ("gate", "up") - ] - + [ - f"{model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{y}_proj.weight" - for y in ("gate", "up") - ] - ) - else: - aoa_statements += [ - f"{model_prefix}layers.0.mlp.up_gate_proj.weight^T -> model.layers.0.mlp.gate_proj.weight, model.layers.0.mlp.up_proj.weight, fused_ffn", - f"{model_prefix}layers.0.mlp.gate_proj.weight^T -> model.layers.0.mlp.gate_proj.weight", - f"{model_prefix}layers.0.mlp.up_proj.weight^T -> model.layers.0.mlp.up_proj.weight", - 
f"{model_prefix}layers.$LAYER_ID.mlp.shared_experts.up_gate_proj.weight^T -> model.layers.$LAYER_ID.mlp.shared_experts.gate_proj.weight, model.layers.$LAYER_ID.mlp.shared_experts.up_proj.weight, fused_ffn", - f"{model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight^T -> model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight, fused_ffn", - ] - aoa_statements += ( - [ - f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight^T -> model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight" - for layer_id in range(1, config.num_hidden_layers) - ] - + [ - f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight^T -> model.layers.{layer_id}.mlp.shared_experts.up_proj.weight" - for layer_id in range(1, config.num_hidden_layers) - ] - + [ - f"model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight" - for layer_id in range(1, config.num_hidden_layers) - for expert_id in range(config.n_routed_experts) - ] - + [ - f"model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight" - for layer_id in range(1, config.num_hidden_layers) - for expert_id in range(config.n_routed_experts) - ] - ) aoa_config = {"aoa_statements": aoa_statements} return aoa_config diff --git a/paddleformers/transformers/ernie4_5/modeling.py b/paddleformers/transformers/ernie4_5/modeling.py index e9fda7b09d8..a4d3804b5cf 100644 --- a/paddleformers/transformers/ernie4_5/modeling.py +++ b/paddleformers/transformers/ernie4_5/modeling.py @@ -197,7 +197,6 @@ def __init__(self, config, layer_idx=0): self.num_key_value_heads = config.num_key_value_heads self.head_dim = config.head_dim self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.fuse_attention_qkv = config.fuse_attention_qkv self.gqa_or_mqa = config.num_attention_heads != config.num_key_value_heads if config.tensor_model_parallel_size > 1: @@ -218,36 +217,13 @@ def __init__(self, config, layer_idx=0): kv_hidden_size = self.head_dim * config.num_key_value_heads q_hidden_size = self.head_dim * config.num_attention_heads - if not self.fuse_attention_qkv: - self.q_proj = GeneralLinear.create( - self.hidden_size, - q_hidden_size, - has_bias=config.use_bias, - config=config, - tp_plan="colwise", - ) - self.k_proj = GeneralLinear.create( - self.hidden_size, - kv_hidden_size, - has_bias=config.use_bias, - config=config, - tp_plan="colwise", - ) - self.v_proj = GeneralLinear.create( - self.hidden_size, - kv_hidden_size, - has_bias=config.use_bias, - config=config, - tp_plan="colwise", - ) - else: - self.qkv_proj = GeneralLinear.create( - self.hidden_size, - q_hidden_size + 2 * kv_hidden_size, - has_bias=config.use_bias, - config=config, - tp_plan="colwise", - ) + self.qkv_proj = GeneralLinear.create( + self.hidden_size, + q_hidden_size + 2 * kv_hidden_size, + has_bias=config.use_bias, + config=config, + tp_plan="colwise", + ) self.o_proj = GeneralLinear.create( q_hidden_size, @@ -288,39 +264,27 @@ def forward( - attention_weights: Optional attention probabilities - updated_key_value_cache: Optional updated cache """ - if not self.fuse_attention_qkv: - if self.config.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - else: - bsz, q_len, _ = hidden_states.shape - - 
query_states = self.q_proj(hidden_states).reshape([bsz, q_len, -1, self.head_dim]) - key_states = self.k_proj(hidden_states).reshape([bsz, q_len, -1, self.head_dim]) - value_states = self.v_proj(hidden_states).reshape([bsz, q_len, -1, self.head_dim]) + mix_layer = self.qkv_proj(hidden_states) + if self.config.sequence_parallel: + max_sequence_length = self.config.max_sequence_length + bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length + q_len = max_sequence_length + target_shape = [ + bsz, + q_len, + self.num_key_value_heads, + (self.num_key_value_groups + 2) * self.head_dim, + ] else: - mix_layer = self.qkv_proj(hidden_states) - if self.config.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - target_shape = [ - bsz, - q_len, - self.num_key_value_heads, - (self.num_key_value_groups + 2) * self.head_dim, - ] - else: - target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] - mix_layer = paddle.reshape_(mix_layer, target_shape) - query_states, key_states, value_states = paddle.split( - mix_layer, - num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], - axis=-1, - ) - if self.gqa_or_mqa: - query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) + target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] + mix_layer = paddle.reshape_(mix_layer, target_shape) + query_states, key_states, value_states = paddle.split( + mix_layer, + num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], + axis=-1, + ) + if self.gqa_or_mqa: + query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) # b l h d -> b h l d query_states = query_states.transpose(1, 2) @@ -377,7 +341,7 @@ def __init__(self, config, layer_idx): self.layer_idx = layer_idx self.config = config self.self_attn = Ernie4_5Attention(config, layer_idx) - self.mlp = Ernie4_5MLP(config, fuse_up_gate=config.fuse_attention_ffn) + self.mlp = Ernie4_5MLP(config, fuse_up_gate=True) self.input_layernorm = GeneralNorm.create( config=config, norm_type="rms_norm", @@ -492,30 +456,18 @@ def _gen_aoa_config(cls, config: Ernie4_5Config): } # attention qkv - if not config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - else: + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", + ] + if config.use_bias: aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", + f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> 
{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", ] - if config.use_bias: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - ] # FFN - if not config.fuse_attention_ffn: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.{p}_proj.weight" - for p in ("gate", "up") - ] - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", - ] + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", + ] # lm_head if config.tie_word_embeddings: @@ -535,39 +487,27 @@ def _gen_inv_aoa_config(cls, config: Ernie4_5Config): f"{model_prefix}norm.weight -> model.norm.weight", ] - if not config.fuse_attention_qkv: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> model.layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - else: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", - ] - for layer_id in range(config.num_hidden_layers): - for x in ("q", "k", "v"): - aoa_statements += [ - f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" - ] - if config.use_bias: + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", + ] + for layer_id in range(config.num_hidden_layers): + for x in ("q", "k", "v"): aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", + f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" ] - - if not config.fuse_attention_ffn: + if config.use_bias: aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.mlp.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.{y}_proj.weight" - for y in ("gate", "up") + f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", ] - else: + + aoa_statements += [ + 
f"{model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.gate_proj.weight, model.layers.$LAYER_ID.mlp.up_proj.weight, fused_ffn", + ] + for layer_id in range(config.num_hidden_layers): aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.gate_proj.weight, model.layers.$LAYER_ID.mlp.up_proj.weight, fused_ffn", + f"model.layers.{layer_id}.mlp.gate_proj.weight^T -> model.layers.{layer_id}.mlp.gate_proj.weight", + f"model.layers.{layer_id}.mlp.up_proj.weight^T -> model.layers.{layer_id}.mlp.up_proj.weight", ] - for layer_id in range(config.num_hidden_layers): - aoa_statements += [ - f"model.layers.{layer_id}.mlp.gate_proj.weight^T -> model.layers.{layer_id}.mlp.gate_proj.weight", - f"model.layers.{layer_id}.mlp.up_proj.weight^T -> model.layers.{layer_id}.mlp.up_proj.weight", - ] if config.tie_word_embeddings: aoa_statements += ["lm_head.weight -> _"] diff --git a/paddleformers/transformers/ernie4_5_moe/modeling.py b/paddleformers/transformers/ernie4_5_moe/modeling.py index 2eaab83e850..faf2cf2c0ca 100644 --- a/paddleformers/transformers/ernie4_5_moe/modeling.py +++ b/paddleformers/transformers/ernie4_5_moe/modeling.py @@ -290,7 +290,7 @@ def __init__(self, config, layer_idx): config.hidden_size, config.moe_intermediate_size, layer_idx, - fuse_up_gate=config.fuse_attention_ffn, + fuse_up_gate=True, ) ) else: @@ -308,7 +308,7 @@ def __init__(self, config, layer_idx): deepcopy(config), config.hidden_size, config.moe_intermediate_size * config.moe_num_shared_experts, - fuse_up_gate=config.fuse_attention_ffn, + fuse_up_gate=True, ) use_expert_out_alltoall = use_expert_out_alltoall = "alltoall" in config.moe_multimodal_dispatch_use_allgather use_padding = "unpad" not in config.moe_multimodal_dispatch_use_allgather @@ -367,7 +367,7 @@ def __init__(self, config, layer_idx): config, hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, - fuse_up_gate=config.fuse_attention_ffn, + fuse_up_gate=True, ) if config.sequence_parallel and isinstance( @@ -544,53 +544,23 @@ def _gen_aoa_config(cls, config: Ernie4_5_MoeConfig): } # attention qkv - if not config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - aoa_config["aoa_statements"] += [ - f"model.mtp_block.$LAYER_ID.self_attn.{x}_proj.weight^T -> {model_prefix}mtp_block.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - else: + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", + f"model.mtp_block.$LAYER_ID.self_attn.q_proj.weight^T, model.mtp_block.$LAYER_ID.self_attn.k_proj.weight^T, model.mtp_block.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}mtp_block.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", + ] + if config.use_bias: aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, 
num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", - f"model.mtp_block.$LAYER_ID.self_attn.q_proj.weight^T, model.mtp_block.$LAYER_ID.self_attn.k_proj.weight^T, model.mtp_block.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}mtp_block.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", + f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", + f"model.mtp_block.$LAYER_ID.self_attn.q_proj.bias, model.mtp_block.$LAYER_ID.self_attn.k_proj.bias, model.mtp_block.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}mtp_block.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", ] - if config.use_bias: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - f"model.mtp_block.$LAYER_ID.self_attn.q_proj.bias, model.mtp_block.$LAYER_ID.self_attn.k_proj.bias, model.mtp_block.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}mtp_block.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - ] # FFN - if not config.fuse_attention_ffn: - aoa_config["aoa_statements"] += ( - [ - f"model.layers.$LAYER_ID.mlp.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.{p}_proj.weight" - for p in ("gate", "up") - ] - + [ - f"model.layers.$LAYER_ID.mlp.shared_experts.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.shared_experts.{p}_proj.weight" - for p in ("gate", "up") - ] - + [ - f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{p}_proj.weight" - for p in ("gate", "up") - ] - + [ - f"model.mtp_block.$LAYER_ID.mlp.{p}_proj.weight^T -> {model_prefix}mtp_block.$LAYER_ID.mlp.{p}_proj.weight" - for p in ("gate", "up") - ] - ) - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", - f"model.layers.$LAYER_ID.mlp.shared_experts.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.shared_experts.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.shared_experts.up_gate_proj.weight, fused_ffn", - f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight, fused_ffn", - f"model.mtp_block.$LAYER_ID.mlp.gate_proj.weight^T, model.mtp_block.$LAYER_ID.mlp.up_proj.weight^T -> {model_prefix}mtp_block.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", - ] + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", + 
f"model.layers.$LAYER_ID.mlp.shared_experts.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.shared_experts.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.shared_experts.up_gate_proj.weight, fused_ffn", + f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight, fused_ffn", + f"model.mtp_block.$LAYER_ID.mlp.gate_proj.weight^T, model.mtp_block.$LAYER_ID.mlp.up_proj.weight^T -> {model_prefix}mtp_block.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", + ] if config.tie_word_embeddings: aoa_config["aoa_statements"] += ["model.embed_tokens.weight -> lm_head.weight"] @@ -620,83 +590,53 @@ def _gen_inv_aoa_config(cls, config: Ernie4_5_MoeConfig): f"{model_prefix}mtp_linear_proj.$LAYER_ID.weight^T -> model.mtp_linear_proj.$LAYER_ID.weight", ] - if not config.fuse_attention_qkv: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> model.layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - aoa_statements += [ - f"{model_prefix}mtp_block.$LAYER_ID.self_attn.{x}_proj.weight^T -> model.mtp_block.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - else: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", - f"{model_prefix}mtp_block.$LAYER_ID.self_attn.qkv_proj.weight -> model.mtp_block.$LAYER_ID.self_attn.q_proj.weight, model.mtp_block.$LAYER_ID.self_attn.k_proj.weight, model.mtp_block.$LAYER_ID.self_attn.v_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", - ] - for x in ("q", "k", "v"): - for layer_id in range(config.num_hidden_layers): - aoa_statements += [ - f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight", - ] - for layer_id in range(config.num_nextn_predict_layers): - aoa_statements += [ - f"model.mtp_block.{layer_id}.self_attn.{x}_proj.weight^T -> model.mtp_block.{layer_id}.self_attn.{x}_proj.weight", - ] - if config.use_bias: + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", + f"{model_prefix}mtp_block.$LAYER_ID.self_attn.qkv_proj.weight -> model.mtp_block.$LAYER_ID.self_attn.q_proj.weight, model.mtp_block.$LAYER_ID.self_attn.k_proj.weight, model.mtp_block.$LAYER_ID.self_attn.v_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", + ] + for x in ("q", "k", "v"): + for layer_id in range(config.num_hidden_layers): aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - f"{model_prefix}mtp_block.$LAYER_ID.self_attn.qkv_proj.bias -> 
model.mtp_block.$LAYER_ID.self_attn.q_proj.bias, model.mtp_block.$LAYER_ID.self_attn.k_proj.bias, model.mtp_block.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - ] - - if not config.fuse_attention_ffn: - aoa_statements += ( - [ - f"{model_prefix}layers.$LAYER_ID.mlp.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.{y}_proj.weight" - for y in ("gate", "up") - ] - + [ - f"{model_prefix}layers.$LAYER_ID.mlp.shared_experts.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.shared_experts.{y}_proj.weight" - for y in ("gate", "up") + f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight", ] - + [ - f"{model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{y}_proj.weight" - for y in ("gate", "up") - ] - + [ - f"{model_prefix}mtp_block.$LAYER_ID.mlp.{y}_proj.weight^T -> model.mtp_block.$LAYER_ID.mlp.{y}_proj.weight" - for y in ("gate", "up") + for layer_id in range(config.num_nextn_predict_layers): + aoa_statements += [ + f"model.mtp_block.{layer_id}.self_attn.{x}_proj.weight^T -> model.mtp_block.{layer_id}.self_attn.{x}_proj.weight", ] - ) - else: + if config.use_bias: aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.gate_proj.weight, model.layers.$LAYER_ID.mlp.up_proj.weight, fused_ffn", - f"{model_prefix}layers.$LAYER_ID.mlp.shared_experts.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.shared_experts.gate_proj.weight, model.layers.$LAYER_ID.mlp.shared_experts.up_proj.weight, fused_ffn", - f"{model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight, fused_ffn", - f"{model_prefix}mtp_block.$LAYER_ID.mlp.up_gate_proj.weight -> model.mtp_block.$LAYER_ID.mlp.gate_proj.weight, model.mtp_block.$LAYER_ID.mlp.up_proj.weight, fused_ffn", + f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", + f"{model_prefix}mtp_block.$LAYER_ID.self_attn.qkv_proj.bias -> model.mtp_block.$LAYER_ID.self_attn.q_proj.bias, model.mtp_block.$LAYER_ID.self_attn.k_proj.bias, model.mtp_block.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", ] - # mlp - for layer_id in range(config.moe_layer_start_index): - for y in ("gate", "up"): - aoa_statements += [ - f"model.layers.{layer_id}.mlp.{y}_proj.weight^T -> model.layers.{layer_id}.mlp.{y}_proj.weight", - ] - # experts - for layer_id in range(config.moe_layer_start_index, config.num_hidden_layers): - for y in ("gate", "up"): - aoa_statements += [ - f"model.layers.{layer_id}.mlp.shared_experts.{y}_proj.weight^T -> model.layers.{layer_id}.mlp.shared_experts.{y}_proj.weight" - ] - for expert_id in range(config.moe_num_experts): - aoa_statements += [ - f"model.layers.{layer_id}.mlp.experts.{expert_id}.{y}_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.{y}_proj.weight" - ] - # mtp - for layer_id in range(config.num_nextn_predict_layers): - for y in ("gate", "up"): + + aoa_statements += [ + 
f"{model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.gate_proj.weight, model.layers.$LAYER_ID.mlp.up_proj.weight, fused_ffn", + f"{model_prefix}layers.$LAYER_ID.mlp.shared_experts.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.shared_experts.gate_proj.weight, model.layers.$LAYER_ID.mlp.shared_experts.up_proj.weight, fused_ffn", + f"{model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight, fused_ffn", + f"{model_prefix}mtp_block.$LAYER_ID.mlp.up_gate_proj.weight -> model.mtp_block.$LAYER_ID.mlp.gate_proj.weight, model.mtp_block.$LAYER_ID.mlp.up_proj.weight, fused_ffn", + ] + # mlp + for layer_id in range(config.moe_layer_start_index): + for y in ("gate", "up"): + aoa_statements += [ + f"model.layers.{layer_id}.mlp.{y}_proj.weight^T -> model.layers.{layer_id}.mlp.{y}_proj.weight", + ] + # experts + for layer_id in range(config.moe_layer_start_index, config.num_hidden_layers): + for y in ("gate", "up"): + aoa_statements += [ + f"model.layers.{layer_id}.mlp.shared_experts.{y}_proj.weight^T -> model.layers.{layer_id}.mlp.shared_experts.{y}_proj.weight" + ] + for expert_id in range(config.moe_num_experts): aoa_statements += [ - f"model.mtp_block.{layer_id}.mlp.{y}_proj.weight^T -> model.mtp_block.{layer_id}.mlp.{y}_proj.weight" + f"model.layers.{layer_id}.mlp.experts.{expert_id}.{y}_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.{y}_proj.weight" ] + # mtp + for layer_id in range(config.num_nextn_predict_layers): + for y in ("gate", "up"): + aoa_statements += [ + f"model.mtp_block.{layer_id}.mlp.{y}_proj.weight^T -> model.mtp_block.{layer_id}.mlp.{y}_proj.weight" + ] if config.tie_word_embeddings: aoa_statements += ["lm_head.weight -> _"] diff --git a/paddleformers/transformers/ernie4_5_moe_vl/model/configuration.py b/paddleformers/transformers/ernie4_5_moe_vl/model/configuration.py index 984fdd9ad42..fa2a3fd33ee 100644 --- a/paddleformers/transformers/ernie4_5_moe_vl/model/configuration.py +++ b/paddleformers/transformers/ernie4_5_moe_vl/model/configuration.py @@ -44,7 +44,6 @@ "pad_token_id": 0, "use_cache": False, "recompute": False, - "use_flash_attention": True, "use_pure_fp16": False, }, } @@ -75,12 +74,11 @@ def __init__( initializer_range=0.02, # no use rms_norm_eps=1e-6, use_cache=False, - use_flash_attention=True, use_sparse_flash_attn=True, use_var_len_flash_attn=False, recompute_use_reentrant=False, use_rmsnorm=True, - fuse_rms_norm=False, + fuse_rms_norm=True, fuse_ln=False, pad_token_id=0, bos_token_id=1, @@ -93,7 +91,7 @@ def __init__( weight_share_add_bias=True, max_sequence_length=None, ignored_index=-100, - add_tail_layers=False, + num_empty_layers_add_in_tail=False, attention_probs_dropout_prob=0.0, hidden_dropout_prob=0.0, compression_ratio: float = 1.0, @@ -120,7 +118,6 @@ def __init__( num_attention_heads (int): Number of attention heads for each attention layer rms_norm_eps (float): The epsilon used by the RMS normalization layers use_cache (bool): Whether to use caching for faster generation (decoding) - use_flash_attention (bool): Whether to use FlashAttention for optimized attention computation use_sparse_flash_attn (bool): Whether to use sparse FlashAttention use_var_len_flash_attn (bool): Whether to use variable-length FlashAttention recompute_use_reentrant (bool): Whether to use reentrant checkpointing @@ -137,7 +134,7 @@ def __init__( weight_share_add_bias (bool): 
Whether to share bias weights in certain layers max_sequence_length (int): Maximum sequence length for positional embeddings ignored_index (int): Target value that is ignored during loss computation - add_tail_layers (int): Whether to add additional layers at the end + num_empty_layers_add_in_tail (int): Whether to add additional layers at the end attention_probs_dropout_prob (float): Dropout probability for attention weights hidden_dropout_prob (float): Dropout probability for hidden layers compression_ratio (float): Ratio for KV cache compression (1.0 = no compression) @@ -172,7 +169,6 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.use_flash_attention = use_flash_attention self.use_sparse_flash_attn = use_sparse_flash_attn self.recompute_use_reentrant = recompute_use_reentrant self.use_var_len_flash_attn = use_var_len_flash_attn @@ -193,7 +189,7 @@ def __init__( self.fuse_softmax_mask = fuse_softmax_mask self.ignored_index = ignored_index - self.add_tail_layers = add_tail_layers + self.num_empty_layers_add_in_tail = num_empty_layers_add_in_tail self.skip_recompute_ops = dict() self.attention_probs_dropout_prob = attention_probs_dropout_prob diff --git a/paddleformers/transformers/ernie4_5_moe_vl/model/modeling.py b/paddleformers/transformers/ernie4_5_moe_vl/model/modeling.py index b76101bf5f8..190d3894023 100644 --- a/paddleformers/transformers/ernie4_5_moe_vl/model/modeling.py +++ b/paddleformers/transformers/ernie4_5_moe_vl/model/modeling.py @@ -748,10 +748,7 @@ def set_attn_func(self): Selects between flash/core attention. """ config = self.config - if config.use_flash_attention: - self.attn_func = self._flash_attention_wrapper - else: - self.attn_func = self.core_attn + self.attn_func = self._flash_attention_wrapper if config.cachekv_quant: from paddleslim.common.wrapper_function import FuncWrapper diff --git a/paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe.py b/paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe.py index 8948ea0f545..f2753298828 100644 --- a/paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe.py +++ b/paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe.py @@ -1542,9 +1542,7 @@ def get_decoder(self): def prepare_attention_mask_for_generation(self, input_ids, pad_token_id, eos_token_id): """Avoid using attention_mask with flash_attn on generation.""" - if self.config.use_flash_attention: - return None - return super().prepare_attention_mask_for_generation(input_ids, pad_token_id, eos_token_id) + return None def prepare_inputs_for_generation( self, diff --git a/paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe_pp.py b/paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe_pp.py index 0a3be64306e..70915b44816 100644 --- a/paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe_pp.py +++ b/paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe_pp.py @@ -103,7 +103,7 @@ def get_pp_vp_split_layers(config, skip_recompute_num=-1): config (Config): Model configuration object containing: - num_hidden_layers (int): Total number of transformer layers - virtual_pipeline_model_parallel_size (int): Virtual pipeline parallelism degree - - add_tail_layers (int): Additional tail layers to append + - num_empty_layers_add_in_tail (int): Additional tail layers to append skip_recompute_num (int): Number of layers per virtual pipeline stage to exclude from recomputation. Defaults to -1 (auto-configure). 
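# --- Illustrative sketch (not part of this patch): the layer-count check updated by the
# surrounding hunks. Empty tail layers are appended so the total layer count divides the
# pipeline degree evenly; when it does not, segmentation falls back to "uniform".
# `choose_seg_method` is a hypothetical helper name, not a function from this repository.
def choose_seg_method(num_hidden_layers, num_empty_layers_add_in_tail, pp_size,
                      seg_method="layer:DecoderLayer|EmptyLayer"):
    layer_num = num_hidden_layers + num_empty_layers_add_in_tail
    if seg_method.endswith("EmptyLayer") and layer_num % pp_size != 0:
        return "uniform"
    return seg_method

# Example: 61 decoder layers plus 3 empty tail layers over pp_size=8 keeps the layer-based
# segmentation, since (61 + 3) % 8 == 0.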
@@ -122,7 +122,7 @@ def get_pp_vp_split_layers(config, skip_recompute_num=-1): assert pp_size > 1, ( "Only support pipeline parallel, " f"pp_size must be greater than 1, but got pp_size: {pp_size}" ) - layer_num = config.num_hidden_layers + config.add_tail_layers + layer_num = config.num_hidden_layers + config.num_empty_layers_add_in_tail if skip_recompute_num == -1: # select all layers to skip recompute diff --git a/paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe_vl_pp.py b/paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe_vl_pp.py index b9a73730054..92eaad1d9db 100644 --- a/paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe_vl_pp.py +++ b/paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe_vl_pp.py @@ -1226,7 +1226,7 @@ def _need_full_recompute(layer_idx): f"model.layers.{i}", ) - for i in range(config.add_tail_layers): + for i in range(config.num_empty_layers_add_in_tail): self.add_sequential_layer( LayerDesc( EmptyLayer, @@ -1266,7 +1266,9 @@ def _need_full_recompute(layer_idx): pass if ( seg_method == "layer:Ernie4_5_DecoderLayer|ErnieDecoderLayer|EmptyLayer" - and (config.num_hidden_layers + config.add_tail_layers) % get_hcg().topology().get_dim_size("pipe") != 0 + and (config.num_hidden_layers + config.num_empty_layers_add_in_tail) + % get_hcg().topology().get_dim_size("pipe") + != 0 ): seg_method = "uniform" logger.info(f"using recompute_interval={recompute_interval}, seg_method={seg_method}") diff --git a/paddleformers/transformers/gemma3_text/modeling.py b/paddleformers/transformers/gemma3_text/modeling.py index 0510b211d32..ae07f6af4c0 100644 --- a/paddleformers/transformers/gemma3_text/modeling.py +++ b/paddleformers/transformers/gemma3_text/modeling.py @@ -202,7 +202,6 @@ def __init__(self, config: Gemma3TextConfig, layer_idx: int): self.attention_dropout = config.attention_dropout self.is_causal = not config.use_bidirectional_attention self.attn_implementation = config._attn_implementation - self.fuse_attention_qkv = config.fuse_attention_qkv self.num_heads = config.num_attention_heads self.num_key_value_heads = config.num_key_value_heads @@ -223,36 +222,13 @@ def __init__(self, config: Gemma3TextConfig, layer_idx: int): kv_hidden_size = config.num_key_value_heads * self.head_dim q_hidden_size = config.num_attention_heads * self.head_dim - if not self.fuse_attention_qkv: - self.q_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - self.k_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - self.v_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - else: - self.qkv_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size + 2 * kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) + self.qkv_proj = GeneralLinear.create( + config.hidden_size, + q_hidden_size + 2 * kv_hidden_size, + has_bias=config.attention_bias, + config=config, + tp_plan="colwise", + ) self.o_proj = GeneralLinear.create( q_hidden_size, config.hidden_size, @@ -281,40 +257,26 @@ def forward( use_cache: bool = False, attn_mask_startend_row_indices: Optional[paddle.Tensor] = None, ) -> tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[tuple[paddle.Tensor]]]: - if not self.fuse_attention_qkv: - if self.config.sequence_parallel: - 
max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - else: - bsz, q_len, _ = hidden_states.shape - - hidden_shape = (bsz, q_len, -1, self.head_dim) - - query_states = self.q_proj(hidden_states).reshape(hidden_shape) - key_states = self.k_proj(hidden_states).reshape(hidden_shape) - value_states = self.v_proj(hidden_states).reshape(hidden_shape) + mix_layer = self.qkv_proj(hidden_states) + if self.config.sequence_parallel: + max_sequence_length = self.config.max_sequence_length + bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length + q_len = max_sequence_length + target_shape = [ + bsz, + q_len, + self.num_key_value_heads, + (self.num_key_value_groups + 2) * self.head_dim, + ] else: - mix_layer = self.qkv_proj(hidden_states) - if self.config.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - target_shape = [ - bsz, - q_len, - self.num_key_value_heads, - (self.num_key_value_groups + 2) * self.head_dim, - ] - else: - target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] - mix_layer = paddle.reshape_(mix_layer, target_shape) - query_states, key_states, value_states = paddle.split( - mix_layer, - num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], - axis=-1, - ) - query_states = query_states.reshape([0, 0, -1, self.head_dim]) + target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] + mix_layer = paddle.reshape_(mix_layer, target_shape) + query_states, key_states, value_states = paddle.split( + mix_layer, + num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], + axis=-1, + ) + query_states = query_states.reshape([0, 0, -1, self.head_dim]) query_states = self.q_norm(query_states) key_states = self.k_norm(key_states) @@ -364,7 +326,7 @@ def __init__(self, config: Gemma3TextConfig, layer_idx: int): self.layer_idx = layer_idx self.attention_type = config.layer_types[layer_idx] self.self_attn = Gemma3Attention(config=config, layer_idx=layer_idx) - self.mlp = Gemma3MLP(config, fuse_up_gate=config.fuse_attention_ffn) + self.mlp = Gemma3MLP(config, fuse_up_gate=True) self.input_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps) self.pre_feedforward_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps) @@ -454,35 +416,18 @@ def _gen_aoa_config(cls, config: Gemma3TextConfig): } # attention qkv - if not config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - if config.attention_bias: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, 
num_key_value_groups={config.num_key_value_heads}", + ] + if config.attention_bias: aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", + f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", ] - if config.attention_bias: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - ] # FFN - if not config.fuse_attention_ffn: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.{p}_proj.weight" - for p in ("gate", "up") - ] - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", - ] + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", + ] return aoa_config @@ -507,41 +452,24 @@ def _gen_inv_aoa_config(cls, config: Gemma3TextConfig): f"{model_prefix}layers.$LAYER_ID.self_attn.k_norm.weight -> model.layers.$LAYER_ID.self_attn.k_norm.weight", ] - if not config.fuse_attention_qkv: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> model.layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - if config.attention_bias: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias -> model.layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}" - ] + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", + ] + aoa_statements += [ + f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" + for layer_id in range(config.num_hidden_layers) + for x in ("q", "k", "v") + ] + if config.attention_bias: aoa_statements += [ - f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" - for layer_id in range(config.num_hidden_layers) - for x in ("q", "k", "v") + 
f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", ] - if config.attention_bias: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - ] - if not config.fuse_attention_ffn: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.mlp.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.{y}_proj.weight" - for y in ("gate", "up") - ] - else: - aoa_statements += [ - f"{model_prefix}layers.0.mlp.up_gate_proj.weight -> model.layers.0.mlp.gate_proj.weight, model.layers.0.mlp.up_proj.weight, fused_ffn", - "model.layers.0.mlp.gate_proj.weight^T -> model.layers.0.mlp.gate_proj.weight", - "model.layers.0.mlp.up_proj.weight^T -> model.layers.0.mlp.up_proj.weight", - ] + aoa_statements += [ + f"{model_prefix}layers.0.mlp.up_gate_proj.weight -> model.layers.0.mlp.gate_proj.weight, model.layers.0.mlp.up_proj.weight, fused_ffn", + "model.layers.0.mlp.gate_proj.weight^T -> model.layers.0.mlp.gate_proj.weight", + "model.layers.0.mlp.up_proj.weight^T -> model.layers.0.mlp.up_proj.weight", + ] aoa_config = {"aoa_statements": aoa_statements} return aoa_config diff --git a/paddleformers/transformers/glm4_moe/configuration.py b/paddleformers/transformers/glm4_moe/configuration.py index 645eca58c6a..9790aa3d065 100644 --- a/paddleformers/transformers/glm4_moe/configuration.py +++ b/paddleformers/transformers/glm4_moe/configuration.py @@ -154,7 +154,7 @@ def __init__( pp_seg_method="layer:Glm4MoeDecoderLayer", disable_ffn_model_parallel=False, scoring_func="sigmoid", - aux_loss_alpha=0.0001, + router_aux_loss_coef=0.0001, seq_aux=True, topk_method="noaux_tc", using_flex_token=True, @@ -200,7 +200,7 @@ def __init__( self.norm_topk_prob = norm_topk_prob self.use_qk_norm = use_qk_norm self.scoring_func = scoring_func - self.aux_loss_alpha = aux_loss_alpha + self.router_aux_loss_coef = router_aux_loss_coef self.seq_aux = seq_aux self.topk_method = topk_method self.using_flex_token = using_flex_token diff --git a/paddleformers/transformers/glm4_moe/modeling.py b/paddleformers/transformers/glm4_moe/modeling.py index e6b632ae494..e4db9127953 100644 --- a/paddleformers/transformers/glm4_moe/modeling.py +++ b/paddleformers/transformers/glm4_moe/modeling.py @@ -165,7 +165,6 @@ def __init__(self, config: Glm4MoeConfig, layer_idx: Optional[int] = None): self.tensor_parallel = config.tensor_model_parallel_size > 1 self.sequence_parallel = config.sequence_parallel self.attention_bias = config.attention_bias - self.fuse_attention_qkv = config.fuse_attention_qkv self.gqa_or_mqa = config.num_attention_heads != config.num_key_value_heads if config.tensor_model_parallel_size > 1: @@ -181,36 +180,13 @@ def __init__(self, config: Glm4MoeConfig, layer_idx: Optional[int] = None): kv_hidden_size = self.config.num_key_value_heads * self.head_dim q_hidden_size = self.num_attention_heads * self.head_dim - if not self.fuse_attention_qkv: - self.q_proj = GeneralLinear.create( - self.hidden_size, - q_hidden_size, - has_bias=self.attention_bias, - config=config, - tp_plan="colwise", - ) - self.k_proj = 
GeneralLinear.create( - self.hidden_size, - kv_hidden_size, - has_bias=self.attention_bias, - config=config, - tp_plan="colwise", - ) - self.v_proj = GeneralLinear.create( - self.hidden_size, - kv_hidden_size, - has_bias=self.attention_bias, - config=config, - tp_plan="colwise", - ) - else: - self.qkv_proj = GeneralLinear.create( - self.hidden_size, - q_hidden_size + 2 * kv_hidden_size, - has_bias=self.attention_bias, - config=config, - tp_plan="colwise", - ) + self.qkv_proj = GeneralLinear.create( + self.hidden_size, + q_hidden_size + 2 * kv_hidden_size, + has_bias=self.attention_bias, + config=config, + tp_plan="colwise", + ) self.o_proj = GeneralLinear.create( q_hidden_size, self.hidden_size, @@ -248,42 +224,27 @@ def forward( batch_size: Optional[int] = None, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: - if not self.fuse_attention_qkv: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - else: - bsz, q_len, _ = hidden_states.shape - query_states = query_states.reshape([bsz, q_len, -1, self.head_dim]) - key_states = key_states.reshape([bsz, q_len, -1, self.head_dim]) - value_states = value_states.reshape([bsz, q_len, -1, self.head_dim]) + mix_layer = self.qkv_proj(hidden_states) + if self.sequence_parallel: + max_sequence_length = self.config.max_sequence_length + bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length + q_len = max_sequence_length + target_shape = [ + bsz, + q_len, + self.num_key_value_heads, + (self.num_key_value_groups + 2) * self.head_dim, + ] else: - mix_layer = self.qkv_proj(hidden_states) - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - target_shape = [ - bsz, - q_len, - self.num_key_value_heads, - (self.num_key_value_groups + 2) * self.head_dim, - ] - else: - target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] - mix_layer = paddle.reshape_(mix_layer, target_shape) - query_states, key_states, value_states = paddle.split( - mix_layer, - num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], - axis=-1, - ) - if self.gqa_or_mqa: - query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) + target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] + mix_layer = paddle.reshape_(mix_layer, target_shape) + query_states, key_states, value_states = paddle.split( + mix_layer, + num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], + axis=-1, + ) + if self.gqa_or_mqa: + query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) if self.use_qk_norm: # main diff from Llama query_states = self.q_norm(query_states) @@ -434,9 +395,7 @@ def __init__(self, config): config.sequence_parallel = False self.experts = nn.LayerList( [ - Glm4MoeMLP( - config, intermediate_size=config.moe_intermediate_size, fuse_up_gate=config.fuse_attention_ffn - ) + Glm4MoeMLP(config, intermediate_size=config.moe_intermediate_size, fuse_up_gate=True) for _ in 
range(config.n_routed_experts) ] ) @@ -444,7 +403,7 @@ def __init__(self, config): self.shared_experts = Glm4MoeMLP( config=config, intermediate_size=config.moe_intermediate_size * config.n_shared_experts, - fuse_up_gate=config.fuse_attention_ffn, + fuse_up_gate=True, ) def moe(self, hidden_states: paddle.Tensor, topk_indices: paddle.Tensor, topk_weights: paddle.Tensor): @@ -548,7 +507,7 @@ def __init__(self, config): expert_kwargs={ "config": mlp_config, "intermediate_size": mlp_config.moe_intermediate_size, - "fuse_up_gate": config.fuse_attention_ffn, + "fuse_up_gate": True, }, gate=gate, moe_group=moe_group, @@ -568,13 +527,13 @@ def __init__(self, config): self.shared_experts = Glm4MoeMLP( config=config, intermediate_size=config.moe_intermediate_size * config.n_shared_experts, - fuse_up_gate=config.fuse_attention_ffn, + fuse_up_gate=True, ) def forward(self, hidden_states): final_hidden_states, l_aux, _ = super().forward(hidden_states) - if self.training and self.config.aux_loss_alpha > 0.0: - l_aux = l_aux * self.config.aux_loss_alpha + if self.training and self.config.router_aux_loss_coef > 0.0: + l_aux = l_aux * self.config.router_aux_loss_coef final_hidden_states = AddAuxiliaryLoss.apply(final_hidden_states, l_aux) final_hidden_states = final_hidden_states + self.shared_experts(hidden_states) return final_hidden_states @@ -612,7 +571,7 @@ def __init__(self, config: Glm4MoeConfig, layer_idx: int): ) ) else: - self.mlp = Glm4MoeMLP(config, fuse_up_gate=config.fuse_attention_ffn) + self.mlp = Glm4MoeMLP(config, fuse_up_gate=True) self.input_layernorm = GeneralNorm.create( config=config, @@ -871,15 +830,9 @@ def _gen_aoa_config(cls, config: Glm4MoeConfig): aoa_config["aoa_statements"] += [ f"model.layers.0.mlp.down_proj.weight^T -> {model_prefix}layers.{num_head_empty_layers}.mlp.down_proj.weight" ] - if not config.fuse_attention_ffn: - aoa_config["aoa_statements"] += [ - f"model.layers.0.mlp.gate_proj.weight^T -> {model_prefix}layers.{num_head_empty_layers}.mlp.gate_proj.weight", - f"model.layers.0.mlp.up_proj.weight^T -> {model_prefix}layers.{num_head_empty_layers}.mlp.up_proj.weight", - ] - else: - aoa_config["aoa_statements"] += [ - f"model.layers.0.mlp.gate_proj.weight^T, model.layers.0.mlp.up_proj.weight^T -> {model_prefix}layers.{num_head_empty_layers}.mlp.up_gate_proj.weight, fused_ffn", - ] + aoa_config["aoa_statements"] += [ + f"model.layers.0.mlp.gate_proj.weight^T, model.layers.0.mlp.up_proj.weight^T -> {model_prefix}layers.{num_head_empty_layers}.mlp.up_gate_proj.weight, fused_ffn", + ] # layer0 - layer_num_hidden_layers for layer_idx in reversed(range(0, num_hidden_layers)): @@ -892,19 +845,13 @@ def _gen_aoa_config(cls, config: Glm4MoeConfig): f"{prefix}.self_attn.o_proj.weight^T -> {prefix_offset}.self_attn.o_proj.weight", ] # attention qkv - if not config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"{prefix}.self_attn.{x}_proj.weight^T -> {prefix_offset}.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - else: + aoa_config["aoa_statements"] += [ + f"{prefix}.self_attn.q_proj.weight^T, {prefix}.self_attn.k_proj.weight^T, {prefix}.self_attn.v_proj.weight^T -> {prefix_offset}.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", + ] + if config.attention_bias: aoa_config["aoa_statements"] += [ - f"{prefix}.self_attn.q_proj.weight^T, {prefix}.self_attn.k_proj.weight^T, {prefix}.self_attn.v_proj.weight^T -> {prefix_offset}.self_attn.qkv_proj.weight, fused_qkv, 
num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", + f"{prefix}.self_attn.q_proj.bias, {prefix}.self_attn.k_proj.bias, {prefix}.self_attn.v_proj.bias -> {prefix_offset}.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", ] - if config.attention_bias: - aoa_config["aoa_statements"] += [ - f"{prefix}.self_attn.q_proj.bias, {prefix}.self_attn.k_proj.bias, {prefix}.self_attn.v_proj.bias -> {prefix_offset}.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - ] # layer1 - layer_num_hidden_layers for layer_idx in reversed(range(1, num_hidden_layers)): layer_idx_offset = layer_idx + num_head_empty_layers @@ -925,33 +872,24 @@ def _gen_aoa_config(cls, config: Glm4MoeConfig): ] # FFN - if not config.fuse_attention_ffn: - aoa_config["aoa_statements"] += [ - f"{prefix}.mlp.shared_experts.{p}_proj.weight^T -> {prefix_offset}.mlp.shared_experts.{p}_proj.weight" - for p in ("gate", "up") - ] + [ - f"{prefix}.mlp.experts.$EXPERT_ID.{p}_proj.weight^T -> {prefix_offset}.mlp.experts.$EXPERT_ID.{p}_proj.weight" - for p in ("gate", "up") - ] - else: - aoa_config["aoa_statements"] += [ - f"{prefix}.mlp.shared_experts.gate_proj.weight^T, {prefix}.mlp.shared_experts.up_proj.weight^T -> {prefix_offset}.mlp.shared_experts.up_gate_proj.weight, fused_ffn", - ] - if is_fleet: - if using_sonic_moe: - aoa_config["aoa_statements"] += [ - f"{prefix}.mlp.experts.$EXPERT_ID.gate_proj.weight, {prefix}.mlp.experts.$EXPERT_ID.up_proj.weight -> {prefix_offset}.mlp.experts.$EXPERT_ID.up_gate_proj.weight, axis=0", - ] - else: - aoa_config["aoa_statements"] += [ - f"{prefix}.mlp.experts.$EXPERT_ID.gate_proj.weight^T, {prefix}.mlp.experts.$EXPERT_ID.up_proj.weight^T -> {prefix_offset}.mlp.experts.$EXPERT_ID.up_gate_proj.weight, axis=1", - ] - + aoa_config["aoa_statements"] += [ + f"{prefix}.mlp.shared_experts.gate_proj.weight^T, {prefix}.mlp.shared_experts.up_proj.weight^T -> {prefix_offset}.mlp.shared_experts.up_gate_proj.weight, fused_ffn", + ] + if is_fleet: + if using_sonic_moe: + aoa_config["aoa_statements"] += [ + f"{prefix}.mlp.experts.$EXPERT_ID.gate_proj.weight, {prefix}.mlp.experts.$EXPERT_ID.up_proj.weight -> {prefix_offset}.mlp.experts.$EXPERT_ID.up_gate_proj.weight, axis=0", + ] else: aoa_config["aoa_statements"] += [ - f"{prefix}.mlp.experts.$EXPERT_ID.gate_proj.weight^T, {prefix}.mlp.experts.$EXPERT_ID.up_proj.weight^T -> {prefix_offset}.mlp.experts.$EXPERT_ID.up_gate_proj.weight, fused_ffn", + f"{prefix}.mlp.experts.$EXPERT_ID.gate_proj.weight^T, {prefix}.mlp.experts.$EXPERT_ID.up_proj.weight^T -> {prefix_offset}.mlp.experts.$EXPERT_ID.up_gate_proj.weight, axis=1", ] + else: + aoa_config["aoa_statements"] += [ + f"{prefix}.mlp.experts.$EXPERT_ID.gate_proj.weight^T, {prefix}.mlp.experts.$EXPERT_ID.up_proj.weight^T -> {prefix_offset}.mlp.experts.$EXPERT_ID.up_gate_proj.weight, fused_ffn", + ] + if is_fleet and (config.moe_grouped_gemm or using_sonic_moe): ep_weight1 = [] ep_weight2 = [] @@ -1000,17 +938,11 @@ def _gen_inv_aoa_config(cls, config: Glm4MoeConfig): aoa_statements += [ f"{model_prefix}layers.{num_head_empty_layers}.mlp.down_proj.weight^T -> model.layers.0.mlp.down_proj.weight", ] - if not config.fuse_attention_ffn: - aoa_statements += [ - f"{model_prefix}layers.{num_head_empty_layers}.mlp.gate_proj.weight^T -> model.layers.0.mlp.gate_proj.weight", - 
f"{model_prefix}layers.{num_head_empty_layers}.mlp.up_proj.weight^T -> model.layers.0.mlp.up_proj.weight", - ] - else: - aoa_statements += [ - f"{model_prefix}layers.{num_head_empty_layers}.mlp.up_gate_proj.weight -> model.layers.{num_head_empty_layers}.mlp.gate_proj.weight, model.layers.{num_head_empty_layers}.mlp.up_proj.weight, fused_ffn", - f"model.layers.{num_head_empty_layers}.mlp.gate_proj.weight^T -> model.layers.0.mlp.gate_proj.weight", - f"model.layers.{num_head_empty_layers}.mlp.up_proj.weight^T -> model.layers.0.mlp.up_proj.weight", - ] + aoa_statements += [ + f"{model_prefix}layers.{num_head_empty_layers}.mlp.up_gate_proj.weight -> model.layers.{num_head_empty_layers}.mlp.gate_proj.weight, model.layers.{num_head_empty_layers}.mlp.up_proj.weight, fused_ffn", + f"model.layers.{num_head_empty_layers}.mlp.gate_proj.weight^T -> model.layers.0.mlp.gate_proj.weight", + f"model.layers.{num_head_empty_layers}.mlp.up_proj.weight^T -> model.layers.0.mlp.up_proj.weight", + ] # layer 0 -> layer num_hidden_layers-1 for layer_idx in range(0, num_hidden_layers): @@ -1023,23 +955,16 @@ def _gen_inv_aoa_config(cls, config: Glm4MoeConfig): f"{prefix_offset}.post_attention_layernorm.weight -> {prefix}.post_attention_layernorm.weight", f"{prefix_offset}.self_attn.o_proj.weight^T -> {prefix}.self_attn.o_proj.weight", ] - if not config.fuse_attention_qkv: - aoa_statements += [ - f"{prefix_offset}.self_attn.{x}_proj.weight^T -> {prefix}.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - else: - aoa_statements += [ - f"{prefix_offset}.self_attn.qkv_proj.weight -> {prefix}.self_attn.q_proj.weight, {prefix}.self_attn.k_proj.weight, {prefix}.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", - ] + aoa_statements += [ + f"{prefix_offset}.self_attn.qkv_proj.weight -> {prefix}.self_attn.q_proj.weight, {prefix}.self_attn.k_proj.weight, {prefix}.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", + ] + aoa_statements += [ + f"{prefix}.self_attn.{x}_proj.weight^T -> {prefix}.self_attn.{x}_proj.weight" for x in ("q", "k", "v") + ] + if config.attention_bias: aoa_statements += [ - f"{prefix}.self_attn.{x}_proj.weight^T -> {prefix}.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") + f"{prefix_offset}.self_attn.qkv_proj.bias -> {prefix}.self_attn.q_proj.bias, {prefix}.self_attn.k_proj.bias, {prefix}.self_attn.v_proj.bias , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}, axis = 0", ] - if config.attention_bias: - aoa_statements += [ - f"{prefix_offset}.self_attn.qkv_proj.bias -> {prefix}.self_attn.q_proj.bias, {prefix}.self_attn.k_proj.bias, {prefix}.self_attn.v_proj.bias , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}, axis = 0", - ] # layer 1 -> layer num_hidden_layers-1 for layer_idx in range(1, num_hidden_layers): @@ -1068,57 +993,42 @@ def _gen_inv_aoa_config(cls, config: Glm4MoeConfig): f"{prefix_offset}.mlp.shared_experts.down_proj.weight^T -> {prefix}.mlp.shared_experts.down_proj.weight", ] - if not config.fuse_attention_ffn: + aoa_statements += [ + f"{prefix_offset}.mlp.shared_experts.up_gate_proj.weight -> {prefix_offset}.mlp.shared_experts.gate_proj.weight, {prefix_offset}.mlp.shared_experts.up_proj.weight, fused_ffn", + f"{prefix_offset}.mlp.shared_experts.gate_proj.weight^T -> {prefix}.mlp.shared_experts.gate_proj.weight", 
+ f"{prefix_offset}.mlp.shared_experts.up_proj.weight^T -> {prefix}.mlp.shared_experts.up_proj.weight", + ] + if is_fleet: + if using_sonic_moe: + aoa_statements += [ + f"{prefix_offset}.mlp.experts.{expert_id}.up_gate_proj.weight -> {prefix_offset}.mlp.experts.{expert_id}.gate_proj.weight, {prefix_offset}.mlp.experts.{expert_id}.up_proj.weight, axis=0" + for expert_id in range(config.n_routed_experts) + ] + else: + aoa_statements += [ + f"{prefix_offset}.mlp.experts.{expert_id}.up_gate_proj.weight -> {prefix_offset}.mlp.experts.{expert_id}.gate_proj.weight, {prefix_offset}.mlp.experts.{expert_id}.up_proj.weight, axis=1" + for expert_id in range(config.n_routed_experts) + ] + else: + aoa_statements += [ + f"{prefix_offset}.mlp.experts.{expert_id}.up_gate_proj.weight -> {prefix_offset}.mlp.experts.{expert_id}.gate_proj.weight, {prefix_offset}.mlp.experts.{expert_id}.up_proj.weight, fused_ffn" + for expert_id in range(config.n_routed_experts) + ] + if not using_sonic_moe: aoa_statements += ( [ - f"{prefix_offset}.mlp.shared_experts.{y}_proj.weight^T -> {prefix}.mlp.shared_experts.{y}_proj.weight" - for y in ("gate", "up") + f"{prefix_offset}.mlp.experts.{expert_id}.down_proj.weight^T -> {prefix}.mlp.experts.{expert_id}.down_proj.weight" + for expert_id in range(config.n_routed_experts) ] + [ - f"{prefix_offset}.mlp.experts.$EXPERT_ID.{y}_proj.weight^T -> {prefix}.mlp.experts.$EXPERT_ID.{y}_proj.weight" - for y in ("gate", "up") + f"{prefix_offset}.mlp.experts.{expert_id}.gate_proj.weight^T -> {prefix}.mlp.experts.{expert_id}.gate_proj.weight" + for expert_id in range(config.n_routed_experts) ] + [ - f"{prefix_offset}.mlp.experts.$EXPERT_ID.down_proj.weight^T -> {prefix}.mlp.experts.$EXPERT_ID.down_proj.weight" - ] - ) - else: - aoa_statements += [ - f"{prefix_offset}.mlp.shared_experts.up_gate_proj.weight -> {prefix_offset}.mlp.shared_experts.gate_proj.weight, {prefix_offset}.mlp.shared_experts.up_proj.weight, fused_ffn", - f"{prefix_offset}.mlp.shared_experts.gate_proj.weight^T -> {prefix}.mlp.shared_experts.gate_proj.weight", - f"{prefix_offset}.mlp.shared_experts.up_proj.weight^T -> {prefix}.mlp.shared_experts.up_proj.weight", - ] - if is_fleet: - if using_sonic_moe: - aoa_statements += [ - f"{prefix_offset}.mlp.experts.{expert_id}.up_gate_proj.weight -> {prefix_offset}.mlp.experts.{expert_id}.gate_proj.weight, {prefix_offset}.mlp.experts.{expert_id}.up_proj.weight, axis=0" - for expert_id in range(config.n_routed_experts) - ] - else: - aoa_statements += [ - f"{prefix_offset}.mlp.experts.{expert_id}.up_gate_proj.weight -> {prefix_offset}.mlp.experts.{expert_id}.gate_proj.weight, {prefix_offset}.mlp.experts.{expert_id}.up_proj.weight, axis=1" - for expert_id in range(config.n_routed_experts) - ] - else: - aoa_statements += [ - f"{prefix_offset}.mlp.experts.{expert_id}.up_gate_proj.weight -> {prefix_offset}.mlp.experts.{expert_id}.gate_proj.weight, {prefix_offset}.mlp.experts.{expert_id}.up_proj.weight, fused_ffn" + f"{prefix_offset}.mlp.experts.{expert_id}.up_proj.weight^T -> {prefix}.mlp.experts.{expert_id}.up_proj.weight" for expert_id in range(config.n_routed_experts) ] - if not using_sonic_moe: - aoa_statements += ( - [ - f"{prefix_offset}.mlp.experts.{expert_id}.down_proj.weight^T -> {prefix}.mlp.experts.{expert_id}.down_proj.weight" - for expert_id in range(config.n_routed_experts) - ] - + [ - f"{prefix_offset}.mlp.experts.{expert_id}.gate_proj.weight^T -> {prefix}.mlp.experts.{expert_id}.gate_proj.weight" - for expert_id in range(config.n_routed_experts) - ] - + [ - 
f"{prefix_offset}.mlp.experts.{expert_id}.up_proj.weight^T -> {prefix}.mlp.experts.{expert_id}.up_proj.weight" - for expert_id in range(config.n_routed_experts) - ] - ) + ) aoa_config = {"aoa_statements": aoa_statements} return aoa_config diff --git a/paddleformers/transformers/gpt_oss/modeling.py b/paddleformers/transformers/gpt_oss/modeling.py index 088c08603fb..e17e71317ab 100644 --- a/paddleformers/transformers/gpt_oss/modeling.py +++ b/paddleformers/transformers/gpt_oss/modeling.py @@ -590,20 +590,14 @@ def _gen_aoa_config(cls, config: GptOssConfig): } # attention qkv - if not config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", - f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - ] + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" + for x in ("q", "k", "v") + ] + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.self_attn.{x}_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias" + for x in ("q", "k", "v") + ] return aoa_config @@ -628,25 +622,14 @@ def _gen_inv_aoa_config(cls, config: GptOssConfig): f"{model_prefix}layers.$LAYER_ID.mlp.experts.down_proj_bias -> model.layers.$LAYER_ID.mlp.experts.down_proj_bias", ] - if not config.fuse_attention_qkv: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> model.layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias -> model.layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}, axis = 0", - ] - aoa_statements += [ - f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" - for layer_id in range(config.num_hidden_layers) - for x in ("q", "k", "v") - ] + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> 
model.layers.$LAYER_ID.self_attn.{x}_proj.weight" + for x in ("q", "k", "v") + ] + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias -> model.layers.$LAYER_ID.self_attn.{x}_proj.bias" + for x in ("q", "k", "v") + ] aoa_config = {"aoa_statements": aoa_statements} return aoa_config diff --git a/paddleformers/transformers/masking_utils.py b/paddleformers/transformers/masking_utils.py index 4248e271a34..89a5988dd2b 100644 --- a/paddleformers/transformers/masking_utils.py +++ b/paddleformers/transformers/masking_utils.py @@ -132,8 +132,8 @@ def create_causal_masks_and_row_indices( # Enables the efficient built-in causal mode (is_causal=True) # for FA backends (sdpa/flashmask), bypassing manual mask generation. # for third-party attention registered via _attn_implementation, default to bypass mask generation. - attn_impl = getattr(config, "_attn_implementation", "eager") - is_flash_backend = attn_impl != "eager" + _attn_implementation = getattr(config, "_attn_implementation", "eager") + is_flash_backend = _attn_implementation != "eager" is_fully_attended = attention_mask is None or (attention_mask is not None and attention_mask.cast("bool").all()) if is_flash_backend and is_fully_attended: if return_mapping: @@ -241,8 +241,8 @@ def create_causal_mask_and_row_indices( causal_mask = None row_indices = attn_mask_startend_row_indices else: - attn_impl = getattr(config, "_attn_implementation", "eager") - is_flash_backend = attn_impl != "eager" + _attn_implementation = getattr(config, "_attn_implementation", "eager") + is_flash_backend = _attn_implementation != "eager" # Check if the mask can be safely skipped # Condition: Must be Flash Backend AND No extra mask func AND No padding (mask is None or all True) diff --git a/paddleformers/transformers/paddleocr_vl/configuration.py b/paddleformers/transformers/paddleocr_vl/configuration.py index 619792cfaf2..8d5f9ff7d64 100644 --- a/paddleformers/transformers/paddleocr_vl/configuration.py +++ b/paddleformers/transformers/paddleocr_vl/configuration.py @@ -94,7 +94,6 @@ def __init__( use_cache=False, use_sparse_flash_attn=False, _attn_implementation="eager", - fuse_rms_norm=False, pad_token_id=0, bos_token_id=1, eos_token_id=2, @@ -141,7 +140,6 @@ def __init__( self.use_cache = use_cache self.use_sparse_flash_attn = use_sparse_flash_attn self._attn_implementation = _attn_implementation - self.fuse_rms_norm = fuse_rms_norm self.pad_token_id = pad_token_id self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id diff --git a/paddleformers/transformers/qwen2/modeling.py b/paddleformers/transformers/qwen2/modeling.py index 2c8fa89b23c..c9ec81fd2cc 100644 --- a/paddleformers/transformers/qwen2/modeling.py +++ b/paddleformers/transformers/qwen2/modeling.py @@ -91,7 +91,6 @@ def __init__(self, config: Qwen2Config, layer_idx: int = 0): assert config.num_attention_heads // config.num_key_value_heads self.sequence_parallel = config.sequence_parallel - self.fuse_attention_qkv = config.fuse_attention_qkv self.gqa_or_mqa = config.num_attention_heads != config.num_key_value_heads if config.tensor_model_parallel_size > 1: @@ -108,36 +107,13 @@ def __init__(self, config: Qwen2Config, layer_idx: int = 0): kv_hidden_size = self.config.num_key_value_heads * self.head_dim q_hidden_size = self.config.num_attention_heads * self.head_dim - if not self.fuse_attention_qkv: - self.q_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size, - has_bias=True, - config=config, - tp_plan="colwise", - ) - self.k_proj = GeneralLinear.create( - 
config.hidden_size, - kv_hidden_size, - has_bias=True, - config=config, - tp_plan="colwise", - ) - self.v_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=True, - config=config, - tp_plan="colwise", - ) - else: - self.qkv_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size + 2 * kv_hidden_size, - has_bias=True, - config=config, - tp_plan="colwise", - ) + self.qkv_proj = GeneralLinear.create( + config.hidden_size, + q_hidden_size + 2 * kv_hidden_size, + has_bias=True, + config=config, + tp_plan="colwise", + ) self.o_proj = GeneralLinear.create( q_hidden_size, @@ -160,43 +136,27 @@ def forward( **kwargs, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: """Input shape: Batch x Time x Channel""" - if not self.fuse_attention_qkv: - # [bs, seq_len, num_head * head_dim] -> [seq_len / n, bs, num_head * head_dim] (n is model parallelism) - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - else: - bsz, q_len, _ = hidden_states.shape - query_states = query_states.reshape([bsz, q_len, -1, self.head_dim]) - key_states = key_states.reshape([bsz, q_len, -1, self.head_dim]) - value_states = value_states.reshape([bsz, q_len, -1, self.head_dim]) + mix_layer = self.qkv_proj(hidden_states) + if self.sequence_parallel: + max_sequence_length = self.config.max_sequence_length + bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length + q_len = max_sequence_length + target_shape = [ + bsz, + q_len, + self.num_key_value_heads, + (self.num_key_value_groups + 2) * self.head_dim, + ] else: - mix_layer = self.qkv_proj(hidden_states) - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - target_shape = [ - bsz, - q_len, - self.num_key_value_heads, - (self.num_key_value_groups + 2) * self.head_dim, - ] - else: - target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] - mix_layer = paddle.reshape_(mix_layer, target_shape) - query_states, key_states, value_states = paddle.split( - mix_layer, - num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], - axis=-1, - ) - if self.gqa_or_mqa: - query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) + target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] + mix_layer = paddle.reshape_(mix_layer, target_shape) + query_states, key_states, value_states = paddle.split( + mix_layer, + num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], + axis=-1, + ) + if self.gqa_or_mqa: + query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) # [bs, seq_len, num_head, head_dim] -> [bs, num_head, seq_len, head_dim] query_states = query_states.transpose(1, 2) @@ -240,7 +200,7 @@ def __init__(self, config: Qwen2Config, layer_idx: int): self.self_attn = Qwen2Attention(config, layer_idx) - self.mlp = Qwen2MLP(config, fuse_up_gate=config.fuse_attention_ffn) + self.mlp = Qwen2MLP(config, fuse_up_gate=True) self.input_layernorm = 
GeneralNorm.create( config=config, norm_type="rms_norm", @@ -319,33 +279,17 @@ def _gen_aoa_config(cls, config: Qwen2Config): } # attention qkv - if not config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", - ] - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - ] + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", + ] + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", + ] # FFN - if not config.fuse_attention_ffn: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.{p}_proj.weight" - for p in ("gate", "up") - ] - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", - ] + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", + ] # lm_head if config.tie_word_embeddings: @@ -365,42 +309,26 @@ def _gen_inv_aoa_config(cls, config: Qwen2Config): f"{model_prefix}norm.weight -> model.norm.weight", ] - if not config.fuse_attention_qkv: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> model.layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias -> model.layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", - ] - for layer_id in range(config.num_hidden_layers): - for x in ("q", "k", 
"v"): - aoa_statements += [ - f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" - ] - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - ] + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", + ] + for layer_id in range(config.num_hidden_layers): + for x in ("q", "k", "v"): + aoa_statements += [ + f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" + ] + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", + ] - if not config.fuse_attention_ffn: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.mlp.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.{y}_proj.weight" - for y in ("gate", "up") - ] - else: + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.gate_proj.weight, model.layers.$LAYER_ID.mlp.up_proj.weight, fused_ffn", + ] + for layer_id in range(config.num_hidden_layers): aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.gate_proj.weight, model.layers.$LAYER_ID.mlp.up_proj.weight, fused_ffn", + f"model.layers.{layer_id}.mlp.gate_proj.weight^T -> model.layers.{layer_id}.mlp.gate_proj.weight", + f"model.layers.{layer_id}.mlp.up_proj.weight^T -> model.layers.{layer_id}.mlp.up_proj.weight", ] - for layer_id in range(config.num_hidden_layers): - aoa_statements += [ - f"model.layers.{layer_id}.mlp.gate_proj.weight^T -> model.layers.{layer_id}.mlp.gate_proj.weight", - f"model.layers.{layer_id}.mlp.up_proj.weight^T -> model.layers.{layer_id}.mlp.up_proj.weight", - ] if config.tie_word_embeddings: aoa_statements += ["lm_head.weight -> _"] diff --git a/paddleformers/transformers/qwen2_5_vl/modeling.py b/paddleformers/transformers/qwen2_5_vl/modeling.py index e16afab24f2..9d9815d5b16 100644 --- a/paddleformers/transformers/qwen2_5_vl/modeling.py +++ b/paddleformers/transformers/qwen2_5_vl/modeling.py @@ -307,31 +307,15 @@ def _gen_aoa_config(cls, config: Qwen2_5_VLConfig): ] + [f"visual.merger.mlp.{x}.bias -> {visual_prefix}merger.mlp.{x}.bias" for x in ("0", "2")] # attention qkv - if not config.text_config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.bias -> {llm_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, 
model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups={config.text_config.num_key_value_heads}", - f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups={config.text_config.num_key_value_heads}, axis=0", - ] + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups={config.text_config.num_key_value_heads}", + f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups={config.text_config.num_key_value_heads}, axis=0", + ] # FFN - if not config.text_config.fuse_attention_ffn: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.{p}_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.mlp.{p}_proj.weight" - for p in ("gate", "up") - ] - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", - ] + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", + ] # Qwen2_5_VLModel without lm_head if cls.base_model_prefix: @@ -391,41 +375,25 @@ def _gen_inv_aoa_config(cls, config: Qwen2_5_VLConfig): ] + [f"{visual_prefix}merger.mlp.{x}.bias -> visual.merger.mlp.{x}.bias" for x in ("0", "2")] # attention qkv - if not config.text_config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> model.layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias -> model.layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: - aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups = {config.text_config.num_key_value_heads}", - f"{llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups = {config.text_config.num_key_value_heads}, axis=0", - ] - aoa_config["aoa_statements"] += [ - f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" - for layer_id in range(config.text_config.num_hidden_layers) - for x in ("q", "k", "v") - ] + 
aoa_config["aoa_statements"] += [ + f"{llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> {llm_prefix}layers.$LAYER_ID.self_attn.q_proj.weight, {llm_prefix}layers.$LAYER_ID.self_attn.k_proj.weight, {llm_prefix}layers.$LAYER_ID.self_attn.v_proj.weight, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups = {config.text_config.num_key_value_heads}", + f"{llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> {llm_prefix}layers.$LAYER_ID.self_attn.q_proj.bias, {llm_prefix}layers.$LAYER_ID.self_attn.k_proj.bias, {llm_prefix}layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups = {config.text_config.num_key_value_heads}, axis=0", + ] + aoa_config["aoa_statements"] += [ + f"{llm_prefix}layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" + for layer_id in range(config.text_config.num_hidden_layers) + for x in ("q", "k", "v") + ] # FFN - if not config.text_config.fuse_attention_ffn: - aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.$LAYER_ID.mlp.{p}_proj.weight^T -> model.layers.$LAYER_ID.mlp.{p}_proj.weight" - for p in ("gate", "up") - ] - else: - aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.gate_proj.weight, model.layers.$LAYER_ID.mlp.up_proj.weight, fused_ffn" - ] - aoa_config["aoa_statements"] += [ - f"model.layers.{layer_id}.mlp.{x}_proj.weight^T -> model.layers.{layer_id}.mlp.{x}_proj.weight" - for layer_id in range(config.text_config.num_hidden_layers) - for x in ("gate", "up") - ] + aoa_config["aoa_statements"] += [ + f"{llm_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight -> {llm_prefix}layers.$LAYER_ID.mlp.gate_proj.weight, {llm_prefix}layers.$LAYER_ID.mlp.up_proj.weight, fused_ffn" + ] + aoa_config["aoa_statements"] += [ + f"{llm_prefix}layers.{layer_id}.mlp.{x}_proj.weight^T -> model.layers.{layer_id}.mlp.{x}_proj.weight" + for layer_id in range(config.text_config.num_hidden_layers) + for x in ("gate", "up") + ] # Qwen2_5_VLModel without lm_head if cls.base_model_prefix: @@ -803,7 +771,6 @@ def __init__(self, config: Qwen2_5_VLTextConfig, layer_idx: Optional[int] = None ) self.sequence_parallel = config.sequence_parallel - self.fuse_attention_qkv = config.fuse_attention_qkv self.gqa_or_mqa = config.num_attention_heads != config.num_key_value_heads if config.tensor_model_parallel_size > 1: @@ -820,36 +787,13 @@ def __init__(self, config: Qwen2_5_VLTextConfig, layer_idx: Optional[int] = None kv_hidden_size = self.config.num_key_value_heads * self.head_dim q_hidden_size = self.config.num_attention_heads * self.head_dim - if not self.fuse_attention_qkv: - self.q_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size, - has_bias=True, - config=config, - tp_plan="colwise", - ) - self.k_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=True, - config=config, - tp_plan="colwise", - ) - self.v_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=True, - config=config, - tp_plan="colwise", - ) - else: - self.qkv_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size + 2 * kv_hidden_size, - has_bias=True, - config=config, - tp_plan="colwise", - ) + self.qkv_proj = GeneralLinear.create( + config.hidden_size, + q_hidden_size + 2 * kv_hidden_size, + has_bias=True, + config=config, + tp_plan="colwise", + ) self.o_proj = GeneralLinear.create( q_hidden_size, config.hidden_size, @@ -872,46 
+816,29 @@ def forward( attn_mask_startend_row_indices: Optional[paddle.Tensor] = None, **kwargs, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: - if not self.fuse_attention_qkv: - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - else: - bsz, q_len, _ = hidden_states.shape - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.reshape(bsz, q_len, -1, self.head_dim) - key_states = key_states.reshape(bsz, q_len, -1, self.head_dim) - value_states = value_states.reshape(bsz, q_len, -1, self.head_dim) - + mix_layer = self.qkv_proj(hidden_states) + if self.sequence_parallel: + max_sequence_length = self.config.max_sequence_length + bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length + q_len = max_sequence_length + target_shape = [ + bsz, + q_len, + self.num_key_value_heads, + (self.num_key_value_groups + 2) * self.head_dim, + ] else: - mix_layer = self.qkv_proj(hidden_states) - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - target_shape = [ - bsz, - q_len, - self.num_key_value_heads, - (self.num_key_value_groups + 2) * self.head_dim, - ] - else: - target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] - # mix_layer = mix_layer.reshape(target_shape) - mix_layer = paddle.reshape_(mix_layer, target_shape) - query_states, key_states, value_states = paddle.split( - mix_layer, - num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], - axis=-1, - ) - if self.gqa_or_mqa: - # query_states = query_states.reshape([0, 0, self.num_heads, self.head_dim]) - query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) + target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] + # mix_layer = mix_layer.reshape(target_shape) + mix_layer = paddle.reshape_(mix_layer, target_shape) + query_states, key_states, value_states = paddle.split( + mix_layer, + num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], + axis=-1, + ) + if self.gqa_or_mqa: + # query_states = query_states.reshape([0, 0, self.num_heads, self.head_dim]) + query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) query_states = query_states.transpose(1, 2) key_states = key_states.transpose(1, 2) @@ -960,7 +887,7 @@ def __init__(self, config: Qwen2_5_VLTextConfig, layer_idx: int): ) self.self_attn = Qwen2_5_VLAttention(config, layer_idx) - self.mlp = Qwen2MLP(config, fuse_up_gate=config.fuse_attention_ffn) + self.mlp = Qwen2MLP(config, fuse_up_gate=True) self.input_layernorm = GeneralNorm.create( config=config, norm_type="rms_norm", diff --git a/paddleformers/transformers/qwen2_moe/modeling.py b/paddleformers/transformers/qwen2_moe/modeling.py index 8d47592755a..dcca6f0ec27 100644 --- a/paddleformers/transformers/qwen2_moe/modeling.py +++ b/paddleformers/transformers/qwen2_moe/modeling.py @@ -81,7 +81,6 @@ def __init__(self, config: Qwen2MoeConfig, layer_idx: int = 0): assert config.num_attention_heads // config.num_key_value_heads 
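# Illustrative sketch (not part of the patch): the shape bookkeeping behind the
# fused qkv_proj path that these hunks make the default. NumPy stands in for
# paddle here, and all sizes are made-up examples.
import numpy as np

bsz, q_len, head_dim = 2, 16, 64
num_heads, num_key_value_heads = 8, 2
num_key_value_groups = num_heads // num_key_value_heads

# qkv_proj packs q, k and v per KV head: (groups + 2) slices of head_dim each
mix_layer = np.zeros((bsz, q_len, num_key_value_heads * (num_key_value_groups + 2) * head_dim))
mix_layer = mix_layer.reshape(bsz, q_len, num_key_value_heads, (num_key_value_groups + 2) * head_dim)
q, k, v = np.split(
    mix_layer,
    [num_key_value_groups * head_dim, (num_key_value_groups + 1) * head_dim],
    axis=-1,
)
q = q.reshape(bsz, q_len, num_heads, head_dim)  # GQA: regroup the stacked query slices
assert k.shape == v.shape == (bsz, q_len, num_key_value_heads, head_dim)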
self.sequence_parallel = config.sequence_parallel - self.fuse_attention_qkv = config.fuse_attention_qkv self.gqa_or_mqa = config.num_attention_heads != config.num_key_value_heads if config.tensor_model_parallel_size > 1: @@ -98,36 +97,13 @@ def __init__(self, config: Qwen2MoeConfig, layer_idx: int = 0): kv_hidden_size = self.config.num_key_value_heads * self.head_dim q_hidden_size = self.config.num_attention_heads * self.head_dim - if not self.fuse_attention_qkv: - self.q_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size, - has_bias=config.qkv_bias, - config=config, - tp_plan="colwise", - ) - self.k_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=config.qkv_bias, - config=config, - tp_plan="colwise", - ) - self.v_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=config.qkv_bias, - config=config, - tp_plan="colwise", - ) - else: - self.qkv_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size + 2 * kv_hidden_size, - has_bias=config.qkv_bias, - config=config, - tp_plan="colwise", - ) + self.qkv_proj = GeneralLinear.create( + config.hidden_size, + q_hidden_size + 2 * kv_hidden_size, + has_bias=config.qkv_bias, + config=config, + tp_plan="colwise", + ) self.o_proj = GeneralLinear.create( q_hidden_size, @@ -149,43 +125,27 @@ def forward( **kwargs, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: """Input shape: Batch x Time x Channel""" - if not self.fuse_attention_qkv: - # [bs, seq_len, num_head * head_dim] -> [seq_len / n, bs, num_head * head_dim] (n is model parallelism) - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - else: - bsz, q_len, _ = hidden_states.shape - query_states = query_states.reshape([bsz, q_len, -1, self.head_dim]) - key_states = key_states.reshape([bsz, q_len, -1, self.head_dim]) - value_states = value_states.reshape([bsz, q_len, -1, self.head_dim]) + mix_layer = self.qkv_proj(hidden_states) + if self.sequence_parallel: + max_sequence_length = self.config.max_sequence_length + bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length + q_len = max_sequence_length + target_shape = [ + bsz, + q_len, + self.num_key_value_heads, + (self.num_key_value_groups + 2) * self.head_dim, + ] else: - mix_layer = self.qkv_proj(hidden_states) - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - target_shape = [ - bsz, - q_len, - self.num_key_value_heads, - (self.num_key_value_groups + 2) * self.head_dim, - ] - else: - target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] - mix_layer = paddle.reshape_(mix_layer, target_shape) - query_states, key_states, value_states = paddle.split( - mix_layer, - num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], - axis=-1, - ) - if self.gqa_or_mqa: - query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) + target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] + mix_layer = paddle.reshape_(mix_layer, 
target_shape) + query_states, key_states, value_states = paddle.split( + mix_layer, + num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], + axis=-1, + ) + if self.gqa_or_mqa: + query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) # [bs, seq_len, num_head, head_dim] -> [bs, num_head, seq_len, head_dim] query_states = query_states.transpose(1, 2) @@ -273,15 +233,13 @@ def __init__(self, config): ) self.experts = nn.LayerList( [ - Qwen2MoeMLP( - config, intermediate_size=config.moe_intermediate_size, fuse_up_gate=config.fuse_attention_ffn - ) + Qwen2MoeMLP(config, intermediate_size=config.moe_intermediate_size, fuse_up_gate=True) for _ in range(self.num_experts) ] ) self.shared_expert = Qwen2MoeMLP( - config, intermediate_size=config.shared_expert_intermediate_size, fuse_up_gate=config.fuse_attention_ffn + config, intermediate_size=config.shared_expert_intermediate_size, fuse_up_gate=True ) self.shared_expert_gate = GeneralLinear.create(config.hidden_size, 1, has_bias=False, linear_type="default") @@ -359,7 +317,7 @@ def __init__(self, config: Qwen2MoeConfig, layer_idx: int): self.mlp = Qwen2MoeSparseMoeBlock(config) else: # num_experts == 0 or this layer is not sparse layer - self.mlp = Qwen2MoeMLP(config, fuse_up_gate=config.fuse_attention_ffn) + self.mlp = Qwen2MoeMLP(config, fuse_up_gate=True) self.input_layernorm = GeneralNorm.create( config=config, @@ -528,39 +486,19 @@ def _gen_aoa_config(cls, config: Qwen2MoeConfig): } # attention qkv - if not config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - if config.qkv_bias: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", + ] + if config.qkv_bias: aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", + f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", ] - if config.qkv_bias: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - ] # FFN - if not config.fuse_attention_ffn: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.shared_expert.{p}_proj.weight^T -> 
{model_prefix}layers.$LAYER_ID.mlp.shared_expert.{p}_proj.weight" - for p in ("gate", "up") - ] + [ - f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{p}_proj.weight" - for p in ("gate", "up") - ] - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.shared_expert.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.shared_expert.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.shared_expert.up_gate_proj.weight, fused_ffn", - f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight, fused_ffn", - ] + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.mlp.shared_expert.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.shared_expert.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.shared_expert.up_gate_proj.weight, fused_ffn", + f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight, fused_ffn", + ] # lm_head if config.tie_word_embeddings: @@ -583,53 +521,33 @@ def _gen_inv_aoa_config(cls, config: Qwen2MoeConfig): f"{model_prefix}layers.$LAYER_ID.mlp.shared_expert_gate.weight^T -> model.layers.$LAYER_ID.mlp.shared_expert_gate.weight, dtype='bfloat16'", ] - if not config.fuse_attention_qkv: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> model.layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - if config.qkv_bias: + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", + ] + for layer_id in range(config.num_hidden_layers): + for x in ("q", "k", "v"): aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias -> model.layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") + f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" ] - else: + if config.qkv_bias: aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", + f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", ] - for layer_id in range(config.num_hidden_layers): - for x in ("q", "k", "v"): - aoa_statements += [ - f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" - ] - if config.qkv_bias: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, 
num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - ] - if not config.fuse_attention_ffn: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.mlp.shared_expert.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.shared_expert.{y}_proj.weight" - for y in ("gate", "up") - ] + [ - f"{model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{y}_proj.weight" - for y in ("gate", "up") - ] - else: + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.mlp.shared_expert.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.shared_expert.gate_proj.weight, model.layers.$LAYER_ID.mlp.shared_expert.up_proj.weight, fused_ffn", + f"{model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight, fused_ffn", + ] + for layer_id in range(config.num_hidden_layers): aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.mlp.shared_expert.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.shared_expert.gate_proj.weight, model.layers.$LAYER_ID.mlp.shared_expert.up_proj.weight, fused_ffn", - f"{model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight, fused_ffn", + f"model.layers.{layer_id}.mlp.shared_expert.gate_proj.weight^T -> model.layers.{layer_id}.mlp.shared_expert.gate_proj.weight", + f"model.layers.{layer_id}.mlp.shared_expert.up_proj.weight^T -> model.layers.{layer_id}.mlp.shared_expert.up_proj.weight", ] - for layer_id in range(config.num_hidden_layers): + for expert_id in range(config.num_experts): aoa_statements += [ - f"model.layers.{layer_id}.mlp.shared_expert.gate_proj.weight^T -> model.layers.{layer_id}.mlp.shared_expert.gate_proj.weight", - f"model.layers.{layer_id}.mlp.shared_expert.up_proj.weight^T -> model.layers.{layer_id}.mlp.shared_expert.up_proj.weight", + f"model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight", + f"model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight", ] - for expert_id in range(config.num_experts): - aoa_statements += [ - f"model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight", - f"model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight", - ] if config.tie_word_embeddings: aoa_statements += ["lm_head.weight -> _"] diff --git a/paddleformers/transformers/qwen3/modeling.py b/paddleformers/transformers/qwen3/modeling.py index 99506b75679..7e2283343cc 100644 --- a/paddleformers/transformers/qwen3/modeling.py +++ b/paddleformers/transformers/qwen3/modeling.py @@ -92,7 +92,6 @@ def __init__(self, config: Qwen3Config, layer_idx: int = 0): self.tensor_parallel = config.tensor_model_parallel_size > 1 self.sequence_parallel = config.sequence_parallel - self.fuse_attention_qkv = config.fuse_attention_qkv self.gqa_or_mqa = config.num_attention_heads != config.num_key_value_heads if config.tensor_model_parallel_size > 1: @@ -109,36 +108,13 @@ def __init__(self, config: Qwen3Config, layer_idx: int = 0): kv_hidden_size = self.config.num_key_value_heads * self.head_dim q_hidden_size = 
self.config.num_attention_heads * self.head_dim - if not self.fuse_attention_qkv: - self.q_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - self.k_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - self.v_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - else: - self.qkv_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size + 2 * kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) + self.qkv_proj = GeneralLinear.create( + config.hidden_size, + q_hidden_size + 2 * kv_hidden_size, + has_bias=config.attention_bias, + config=config, + tp_plan="colwise", + ) self.o_proj = GeneralLinear.create( q_hidden_size, @@ -175,46 +151,29 @@ def forward( **kwargs, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: """Input shape: Batch x Time x Channel""" - if not self.fuse_attention_qkv: - # [bs, seq_len, num_head * head_dim] -> [seq_len / n, bs, num_head * head_dim] (n is model parallelism) - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - else: - bsz, q_len, _ = hidden_states.shape - # Add qk norm for Qwen3 model. - query_states = self.q_norm(query_states.reshape([bsz, q_len, -1, self.head_dim])) - key_states = self.k_norm(key_states.reshape([bsz, q_len, -1, self.head_dim])) - value_states = value_states.reshape([bsz, q_len, -1, self.head_dim]) + mix_layer = self.qkv_proj(hidden_states) + if self.sequence_parallel: + max_sequence_length = self.config.max_sequence_length + bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length + q_len = max_sequence_length + target_shape = [ + bsz, + q_len, + self.num_key_value_heads, + (self.num_key_value_groups + 2) * self.head_dim, + ] else: - mix_layer = self.qkv_proj(hidden_states) - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - target_shape = [ - bsz, - q_len, - self.num_key_value_heads, - (self.num_key_value_groups + 2) * self.head_dim, - ] - else: - target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] - mix_layer = paddle.reshape_(mix_layer, target_shape) - query_states, key_states, value_states = paddle.split( - mix_layer, - num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], - axis=-1, - ) - if self.gqa_or_mqa: - query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) - query_states = self.q_norm(query_states) - key_states = self.k_norm(key_states) + target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] + mix_layer = paddle.reshape_(mix_layer, target_shape) + query_states, key_states, value_states = paddle.split( + mix_layer, + num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], + axis=-1, + ) + if self.gqa_or_mqa: + 
query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) + query_states = self.q_norm(query_states) + key_states = self.k_norm(key_states) # [bs, seq_len, num_head, head_dim] -> [bs, num_head, seq_len, head_dim] query_states = query_states.transpose(1, 2) @@ -257,7 +216,7 @@ def __init__(self, config: Qwen3Config, layer_idx: int): self.self_attn = Qwen3Attention(config, layer_idx) - self.mlp = Qwen3MLP(config, fuse_up_gate=config.fuse_attention_ffn) + self.mlp = Qwen3MLP(config, fuse_up_gate=True) self.input_layernorm = GeneralNorm.create( config=config, norm_type="rms_norm", @@ -337,35 +296,18 @@ def _gen_aoa_config(cls, config: Qwen3Config): } # attention qkv - if not config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - if config.attention_bias: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", + ] + if config.attention_bias: aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", + f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", ] - if config.attention_bias: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - ] # FFN - if not config.fuse_attention_ffn: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.{p}_proj.weight" - for p in ("gate", "up") - ] - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", - ] + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", + ] # lm_head if config.tie_word_embeddings: @@ -387,48 +329,30 @@ def _gen_inv_aoa_config(cls, config: Qwen3Config): f"{model_prefix}layers.$LAYER_ID.self_attn.k_norm.weight -> model.layers.$LAYER_ID.self_attn.k_norm.weight", ] - if not config.fuse_attention_qkv: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> 
model.layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - if config.attention_bias: + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", + ] + for layer_id in range(config.num_hidden_layers): + for x in ("q", "k", "v"): aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias -> model.layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") + f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" ] - else: + if config.attention_bias: aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", + f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", ] - for layer_id in range(config.num_hidden_layers): - for x in ("q", "k", "v"): - aoa_statements += [ - f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" - ] - if config.attention_bias: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - ] - if not config.fuse_attention_ffn: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.mlp.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.{y}_proj.weight" - for y in ("gate", "up") - ] - else: + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.gate_proj.weight, model.layers.$LAYER_ID.mlp.up_proj.weight, fused_ffn", + ] + for layer_id in range(config.num_hidden_layers): aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.gate_proj.weight, model.layers.$LAYER_ID.mlp.up_proj.weight, fused_ffn", + f"model.layers.{layer_id}.mlp.gate_proj.weight^T -> model.layers.{layer_id}.mlp.gate_proj.weight", + f"model.layers.{layer_id}.mlp.up_proj.weight^T -> model.layers.{layer_id}.mlp.up_proj.weight", ] - for layer_id in range(config.num_hidden_layers): - aoa_statements += [ - f"model.layers.{layer_id}.mlp.gate_proj.weight^T -> model.layers.{layer_id}.mlp.gate_proj.weight", - f"model.layers.{layer_id}.mlp.up_proj.weight^T -> model.layers.{layer_id}.mlp.up_proj.weight", - ] if config.tie_word_embeddings: aoa_statements += ["lm_head.weight -> _"] - aoa_config = {"aoa_statements": aoa_statements} return aoa_config diff --git a/paddleformers/transformers/qwen3_moe/modeling.py b/paddleformers/transformers/qwen3_moe/modeling.py index b1e3d2f2f8e..4d225049958 100644 --- a/paddleformers/transformers/qwen3_moe/modeling.py +++ 
b/paddleformers/transformers/qwen3_moe/modeling.py @@ -130,7 +130,6 @@ def __init__(self, config: Qwen3MoeConfig, layer_idx: int = 0): self.tensor_parallel = config.tensor_model_parallel_size > 1 self.sequence_parallel = config.sequence_parallel - self.fuse_attention_qkv = config.fuse_attention_qkv self.gqa_or_mqa = config.num_attention_heads != config.num_key_value_heads if config.tensor_model_parallel_size > 1: @@ -147,36 +146,13 @@ def __init__(self, config: Qwen3MoeConfig, layer_idx: int = 0): kv_hidden_size = self.config.num_key_value_heads * self.head_dim q_hidden_size = self.config.num_attention_heads * self.head_dim - if not self.fuse_attention_qkv: - self.q_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - self.k_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - self.v_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - else: - self.qkv_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size + 2 * kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) + self.qkv_proj = GeneralLinear.create( + config.hidden_size, + q_hidden_size + 2 * kv_hidden_size, + has_bias=config.attention_bias, + config=config, + tp_plan="colwise", + ) self.o_proj = GeneralLinear.create( q_hidden_size, @@ -212,46 +188,29 @@ def forward( **kwargs, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: """Input shape: Batch x Time x Channel""" - if not self.fuse_attention_qkv: - # [bs, seq_len, num_head * head_dim] -> [seq_len / n, bs, num_head * head_dim] (n is model parallelism) - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - else: - bsz, q_len, _ = hidden_states.shape - # Add qk norm for Qwen3MoE model. 
- query_states = self.q_norm(query_states.reshape([bsz, q_len, -1, self.head_dim])) - key_states = self.k_norm(key_states.reshape([bsz, q_len, -1, self.head_dim])) - value_states = value_states.reshape([bsz, q_len, -1, self.head_dim]) + mix_layer = self.qkv_proj(hidden_states) + if self.sequence_parallel: + max_sequence_length = self.config.max_sequence_length + bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length + q_len = max_sequence_length + target_shape = [ + bsz, + q_len, + self.num_key_value_heads, + (self.num_key_value_groups + 2) * self.head_dim, + ] else: - mix_layer = self.qkv_proj(hidden_states) - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - target_shape = [ - bsz, - q_len, - self.num_key_value_heads, - (self.num_key_value_groups + 2) * self.head_dim, - ] - else: - target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] - mix_layer = paddle.reshape_(mix_layer, target_shape) - query_states, key_states, value_states = paddle.split( - mix_layer, - num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], - axis=-1, - ) - if self.gqa_or_mqa: - query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) - query_states = self.q_norm(query_states) - key_states = self.k_norm(key_states) + target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] + mix_layer = paddle.reshape_(mix_layer, target_shape) + query_states, key_states, value_states = paddle.split( + mix_layer, + num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], + axis=-1, + ) + if self.gqa_or_mqa: + query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) + query_states = self.q_norm(query_states) + key_states = self.k_norm(key_states) # [bs, seq_len, num_head, head_dim] -> [bs, num_head, seq_len, head_dim] query_states = query_states.transpose(1, 2) @@ -339,9 +298,7 @@ def __init__(self, config): ) self.experts = nn.LayerList( [ - Qwen3MoeMLP( - config, intermediate_size=config.moe_intermediate_size, fuse_up_gate=config.fuse_attention_ffn - ) + Qwen3MoeMLP(config, intermediate_size=config.moe_intermediate_size, fuse_up_gate=True) for _ in range(self.num_experts) ] ) @@ -434,7 +391,7 @@ def __init__(self, config: Qwen3MoeConfig, layer_idx: int): ) else: # num_experts == 0 or this layer is not sparse layer - self.mlp = Qwen3MoeMLP(config, fuse_up_gate=config.fuse_attention_ffn) + self.mlp = Qwen3MoeMLP(config, fuse_up_gate=True) self.input_layernorm = GeneralNorm.create( config=config, @@ -787,46 +744,29 @@ def _gen_aoa_config(cls, config: Qwen3MoeConfig): ] # attention qkv - if not config.fuse_attention_qkv: + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", + ] + if config.attention_bias: aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") + f"model.layers.$LAYER_ID.self_attn.q_proj.bias, 
model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", ] - if config.attention_bias: + + # FFN + if getattr(cls, "is_fleet", False): + if using_sonic_moe: aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") + f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight, axis=0", ] - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", - ] - if config.attention_bias: + else: aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", + f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight, axis=1", ] - # FFN - if not config.fuse_attention_ffn: + else: aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{p}_proj.weight" - for p in ("gate", "up") + f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight, fused_ffn", ] - else: - if getattr(cls, "is_fleet", False): - if using_sonic_moe: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight, axis=0", - ] - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight, axis=1", - ] - - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight, fused_ffn", - ] if getattr(cls, "is_fleet", False) and (config.moe_grouped_gemm or using_sonic_moe): for layer_idx in range(0, config.num_hidden_layers): @@ -881,77 +821,55 @@ def _gen_inv_aoa_config(cls, config: Qwen3MoeConfig): f"{model_prefix}layers.$LAYER_ID.self_attn.k_norm.weight -> model.layers.$LAYER_ID.self_attn.k_norm.weight", ] - if not config.fuse_attention_qkv: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> 
model.layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - if config.attention_bias: + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", + ] + for layer_id in range(config.num_hidden_layers): + for x in ("q", "k", "v"): aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias -> model.layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") + f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" ] - else: + if config.attention_bias: aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", + f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", ] + + if getattr(cls, "is_fleet", False) and (config.moe_grouped_gemm or using_sonic_moe): for layer_id in range(config.num_hidden_layers): - for x in ("q", "k", "v"): - aoa_statements += [ - f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" - ] - if config.attention_bias: + ep_weight1 = [] + ep_weight2 = [] + for expert_id in range(num_experts): + ep_weight1.append(f"{model_prefix}layers.{layer_id}.mlp.experts.{expert_id}.up_gate_proj.weight") + ep_weight2.append(f"{model_prefix}layers.{layer_id}.mlp.experts.{expert_id}.down_proj.weight") + group_gemm1 = ",".join(ep_weight1) + group_gemm2 = ",".join(ep_weight2) aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", + f"{model_prefix}layers.{layer_id}.mlp.grouped_gemm_experts.weight1 -> {group_gemm1}, axis=0" + f"{model_prefix}layers.{layer_id}.mlp.grouped_gemm_experts.weight2 -> {group_gemm2}, axis=0" ] - if not config.fuse_attention_ffn: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{y}_proj.weight" - for y in ("gate", "up") - ] - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.down_proj.weight^T -> model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.down_proj.weight", - ] - else: - if getattr(cls, "is_fleet", False) and (config.moe_grouped_gemm or using_sonic_moe): - for layer_id in range(config.num_hidden_layers): - ep_weight1 = [] - ep_weight2 = [] - for expert_id in range(num_experts): - ep_weight1.append( - f"{model_prefix}layers.{layer_id}.mlp.experts.{expert_id}.up_gate_proj.weight" - ) - ep_weight2.append(f"{model_prefix}layers.{layer_id}.mlp.experts.{expert_id}.down_proj.weight") - group_gemm1 = ",".join(ep_weight1) - group_gemm2 
= ",".join(ep_weight2) - aoa_statements += [ - f"{model_prefix}layers.{layer_id}.mlp.grouped_gemm_experts.weight1 -> {group_gemm1}, axis=0" - f"{model_prefix}layers.{layer_id}.mlp.grouped_gemm_experts.weight2 -> {group_gemm2}, axis=0" - ] - - for layer_id in range(config.num_hidden_layers): - for expert_id in range(num_experts): - if getattr(cls, "is_fleet", False): - if using_sonic_moe: - aoa_statements += [ - f"{model_prefix}layers.{layer_id}.mlp.experts.{expert_id}.up_gate_proj.weight -> model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight, model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight, axis=0", - ] - else: - aoa_statements += [ - f"{model_prefix}layers.{layer_id}.mlp.experts.{expert_id}.up_gate_proj.weight -> model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight, model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight, axis=1", - ] - else: + for layer_id in range(config.num_hidden_layers): + for expert_id in range(num_experts): + if getattr(cls, "is_fleet", False): + if using_sonic_moe: aoa_statements += [ - f"{model_prefix}layers.{layer_id}.mlp.experts.{expert_id}.up_gate_proj.weight -> model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight, model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight, fused_ffn", + f"{model_prefix}layers.{layer_id}.mlp.experts.{expert_id}.up_gate_proj.weight -> model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight, model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight, axis=0", ] - - if not using_sonic_moe: + else: aoa_statements += [ - f"model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight", - f"model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight", - f"model.layers.{layer_id}.mlp.experts.{expert_id}.down_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.down_proj.weight", + f"{model_prefix}layers.{layer_id}.mlp.experts.{expert_id}.up_gate_proj.weight -> model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight, model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight, axis=1", ] + else: + aoa_statements += [ + f"{model_prefix}layers.{layer_id}.mlp.experts.{expert_id}.up_gate_proj.weight -> model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight, model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight, fused_ffn", + ] + + if not using_sonic_moe: + aoa_statements += [ + f"model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight", + f"model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight", + f"model.layers.{layer_id}.mlp.experts.{expert_id}.down_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.down_proj.weight", + ] if config.tie_word_embeddings: aoa_statements += ["lm_head.weight -> _"] diff --git a/paddleformers/transformers/qwen3_next/modeling.py b/paddleformers/transformers/qwen3_next/modeling.py index 5780ac0416f..6014a6e0380 100644 --- a/paddleformers/transformers/qwen3_next/modeling.py +++ b/paddleformers/transformers/qwen3_next/modeling.py @@ -288,9 +288,12 @@ def extra_repr(self): class Qwen3NextAttention(Qwen3MoeAttention): def __init__(self, config: Qwen3NextConfig, layer_idx: int): super().__init__(config, layer_idx) - self.q_proj = GeneralLinear.create( + kv_hidden_size = 
self.config.num_key_value_heads * self.head_dim + q_hidden_size = self.config.num_attention_heads * self.head_dim * 2 + + self.qkv_proj = GeneralLinear.create( config.hidden_size, - config.num_attention_heads * self.head_dim * 2, + q_hidden_size + 2 * kv_hidden_size, has_bias=config.attention_bias, config=config, tp_plan="colwise", @@ -315,16 +318,26 @@ def forward( cache_position: Optional[Tensor] = None, **kwargs, ) -> tuple[Tensor, Optional[Tensor]]: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - + mix_layer = self.qkv_proj(hidden_states) if self.sequence_parallel: max_sequence_length = self.config.max_sequence_length bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length q_len = max_sequence_length + target_shape = [ + bsz, + q_len, + self.num_key_value_heads, + (self.num_key_value_groups * 2 + 2) * self.head_dim, + ] else: bsz, q_len, _ = hidden_states.shape + target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups * 2 + 2) * self.head_dim] + mix_layer = paddle.reshape_(mix_layer, target_shape) + query_states, key_states, value_states = paddle.split( + mix_layer, + num_or_sections=[self.num_key_value_groups * self.head_dim * 2, self.head_dim, self.head_dim], + axis=-1, + ) query_states, gate = paddle.chunk(query_states.view(bsz, q_len, -1, self.head_dim * 2), chunks=2, dim=-1) gate = gate.reshape(bsz, q_len, -1) @@ -933,13 +946,20 @@ def _gen_aoa_config(cls, config: Qwen3NextConfig): f"model.layers.$LAYER_ID.linear_attn.in_proj_qkvz.weight^T -> {model_prefix}layers.$LAYER_ID.linear_attn.in_proj_qkvz.weight", f"model.layers.$LAYER_ID.linear_attn.norm.weight -> {model_prefix}layers.$LAYER_ID.linear_attn.norm.weight", f"model.layers.$LAYER_ID.linear_attn.out_proj.weight^T -> {model_prefix}layers.$LAYER_ID.linear_attn.out_proj.weight", + f"model.layers.$LAYER_ID.self_attn.o_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.o_proj.weight", + f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.down_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.down_proj.weight", + f"model.layers.$LAYER_ID.mlp.shared_expert.down_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.shared_expert.down_proj.weight", ] - # self_attn + # attention qkv aoa_statements += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v", "o") + f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", ] + if config.attention_bias: + aoa_statements += [ + f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", + ] + aoa_statements += [ f"model.layers.$LAYER_ID.self_attn.{x}_norm.weight -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_norm.weight" for x in ("q", "k") @@ -947,12 +967,8 @@ def _gen_aoa_config(cls, config: Qwen3NextConfig): # experts aoa_statements += [ - f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{x}_proj.weight^T -> 
{model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{x}_proj.weight" - for x in ("gate", "up", "down") - ] - aoa_statements += [ - f"model.layers.$LAYER_ID.mlp.shared_expert.{x}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.shared_expert.{x}_proj.weight" - for x in ("gate", "up", "down") + f"model.layers.$LAYER_ID.mlp.shared_expert.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.shared_expert.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.shared_expert.up_gate_proj.weight, fused_ffn", + f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight, fused_ffn", ] return {"aoa_statements": aoa_statements} diff --git a/paddleformers/transformers/qwen3_vl/modeling.py b/paddleformers/transformers/qwen3_vl/modeling.py index 52b76c3620b..473a1bc50fc 100644 --- a/paddleformers/transformers/qwen3_vl/modeling.py +++ b/paddleformers/transformers/qwen3_vl/modeling.py @@ -370,30 +370,18 @@ def _gen_aoa_config(cls, config: Qwen3VLConfig): ] # attention qkv - if not config.text_config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"model.language_model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - aoa_config["aoa_statements"] += [ - f"model.language_model.layers.$LAYER_ID.self_attn.{x}_proj.bias -> {llm_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: + aoa_config["aoa_statements"] += [ + f"model.language_model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.language_model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.language_model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups={config.text_config.num_key_value_heads}" + ] + if config.attention_bias: aoa_config["aoa_statements"] += [ - f"model.language_model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.language_model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.language_model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups={config.text_config.num_key_value_heads}" + f"model.language_model.layers.$LAYER_ID.self_attn.q_proj.bias, model.language_model.layers.$LAYER_ID.self_attn.k_proj.bias, model.language_model.layers.$LAYER_ID.self_attn.v_proj.bias -> {llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups={config.text_config.num_key_value_heads}" ] # FFN - if not config.text_config.fuse_attention_ffn: - aoa_config["aoa_statements"] += [ - f"model.language_model.layers.$LAYER_ID.mlp.{p}_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.mlp.{p}_proj.weight" - for p in ("gate", "up") - ] - else: - aoa_config["aoa_statements"] += [ - f"model.language_model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.language_model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", - ] + aoa_config["aoa_statements"] += [ + f"model.language_model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.language_model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", + ] # Qwen3_VLModel without lm_head if 
cls._tied_weights_keys: @@ -471,40 +459,28 @@ def _gen_inv_aoa_config(cls, config: Qwen3VLConfig): ] # attention qkv - if not config.text_config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> model.language_model.layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias -> model.language_model.layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: - aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.language_model.layers.$LAYER_ID.self_attn.q_proj.weight, model.language_model.layers.$LAYER_ID.self_attn.k_proj.weight, model.language_model.layers.$LAYER_ID.self_attn.v_proj.weight, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups = {config.text_config.num_key_value_heads}", - ] + aoa_config["aoa_statements"] += [ + f"{llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> {llm_prefix}layers.$LAYER_ID.self_attn.q_proj.weight, {llm_prefix}layers.$LAYER_ID.self_attn.k_proj.weight, {llm_prefix}layers.$LAYER_ID.self_attn.v_proj.weight, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups = {config.text_config.num_key_value_heads}", + ] + aoa_config["aoa_statements"] += [ + f"{llm_prefix}layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.language_model.layers.{layer_id}.self_attn.{x}_proj.weight" + for layer_id in range(config.text_config.num_hidden_layers) + for x in ("q", "k", "v") + ] + if config.attention_bias: aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.language_model.layers.{layer_id}.self_attn.{x}_proj.weight" - for layer_id in range(config.text_config.num_hidden_layers) - for x in ("q", "k", "v") + f"{llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.language_model.layers.$LAYER_ID.self_attn.q_proj.bias, model.language_model.layers.$LAYER_ID.self_attn.k_proj.bias, model.language_model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups = {config.text_config.num_key_value_heads}", ] # FFN - if not config.text_config.fuse_attention_ffn: - aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.$LAYER_ID.mlp.{p}_proj.weight^T -> model.language_model.layers.$LAYER_ID.mlp.{p}_proj.weight" - for p in ("gate", "up") - ] - else: - aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight -> model.language_model.layers.$LAYER_ID.mlp.gate_proj.weight, model.language_model.layers.$LAYER_ID.mlp.up_proj.weight, fused_ffn" - ] - aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.{layer_id}.mlp.{x}_proj.weight^T -> model.language_model.layers.{layer_id}.mlp.{x}_proj.weight" - for layer_id in range(config.text_config.num_hidden_layers) - for x in ("gate", "up") - ] + aoa_config["aoa_statements"] += [ + f"{llm_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight -> {llm_prefix}layers.$LAYER_ID.mlp.gate_proj.weight, {llm_prefix}layers.$LAYER_ID.mlp.up_proj.weight, fused_ffn" + ] + aoa_config["aoa_statements"] += [ + f"{llm_prefix}layers.{layer_id}.mlp.{x}_proj.weight^T -> model.language_model.layers.{layer_id}.mlp.{x}_proj.weight" + for layer_id in range(config.text_config.num_hidden_layers) + for x in ("gate", "up") + ] # Qwen3VLModel without lm_head if cls._tied_weights_keys: @@ -915,7 +891,6 @@ def 
__init__(self, config: Qwen3VLTextConfig, layer_idx: Optional[int] = None): # ) self.sequence_parallel = config.sequence_parallel - self.fuse_attention_qkv = config.fuse_attention_qkv self.gqa_or_mqa = config.num_attention_heads != config.num_key_value_heads if config.tensor_model_parallel_size > 1: @@ -932,36 +907,13 @@ def __init__(self, config: Qwen3VLTextConfig, layer_idx: Optional[int] = None): kv_hidden_size = self.config.num_key_value_heads * self.head_dim q_hidden_size = self.config.num_attention_heads * self.head_dim - if not self.fuse_attention_qkv: - self.q_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - self.k_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - self.v_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - else: - self.qkv_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size + 2 * kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) + self.qkv_proj = GeneralLinear.create( + config.hidden_size, + q_hidden_size + 2 * kv_hidden_size, + has_bias=config.attention_bias, + config=config, + tp_plan="colwise", + ) self.o_proj = GeneralLinear.create( q_hidden_size, config.hidden_size, @@ -984,46 +936,29 @@ def forward( attn_mask_startend_row_indices: Optional[paddle.Tensor] = None, **kwargs, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: - if not self.fuse_attention_qkv: - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - else: - bsz, q_len, _ = hidden_states.shape - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.reshape(bsz, q_len, -1, self.head_dim) - key_states = key_states.reshape(bsz, q_len, -1, self.head_dim) - value_states = value_states.reshape(bsz, q_len, -1, self.head_dim) - + mix_layer = self.qkv_proj(hidden_states) + if self.sequence_parallel: + max_sequence_length = self.config.max_sequence_length + bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length + q_len = max_sequence_length + target_shape = [ + bsz, + q_len, + self.num_key_value_heads, + (self.num_key_value_groups + 2) * self.head_dim, + ] else: - mix_layer = self.qkv_proj(hidden_states) - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - target_shape = [ - bsz, - q_len, - self.num_key_value_heads, - (self.num_key_value_groups + 2) * self.head_dim, - ] - else: - target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] - # mix_layer = mix_layer.reshape(target_shape) - mix_layer = paddle.reshape_(mix_layer, target_shape) - query_states, key_states, value_states = paddle.split( - mix_layer, - num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], - axis=-1, - ) - if self.gqa_or_mqa: - # query_states = query_states.reshape([0, 0, self.num_heads, self.head_dim]) - query_states = 
paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) + target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] + # mix_layer = mix_layer.reshape(target_shape) + mix_layer = paddle.reshape_(mix_layer, target_shape) + query_states, key_states, value_states = paddle.split( + mix_layer, + num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], + axis=-1, + ) + if self.gqa_or_mqa: + # query_states = query_states.reshape([0, 0, self.num_heads, self.head_dim]) + query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) # apply qk_norm query_states = self.q_norm(query_states) @@ -1067,7 +1002,7 @@ def __init__(self, config: Qwen3VLTextConfig, layer_idx: int): self.hidden_size = config.hidden_size self.self_attn = Qwen3VLTextAttention(config, layer_idx) - self.mlp = Qwen3VLTextMLP(config, fuse_up_gate=config.fuse_attention_ffn) + self.mlp = Qwen3VLTextMLP(config, fuse_up_gate=True) self.input_layernorm = GeneralNorm.create( config=config, norm_type="rms_norm", diff --git a/paddleformers/transformers/qwen3_vl/modeling_fleet.py b/paddleformers/transformers/qwen3_vl/modeling_fleet.py index a156b10e924..369d7b4e4d2 100644 --- a/paddleformers/transformers/qwen3_vl/modeling_fleet.py +++ b/paddleformers/transformers/qwen3_vl/modeling_fleet.py @@ -336,7 +336,6 @@ class Qwen3VLTextProvider(GPTModelProvider): max_sequence_length: int = 262144 multimodal_embedding: bool = False _save_to_hf: bool = False - use_flash_attention: bool = True use_fused_linear_cross_entropy: bool = True high_precision_rope: bool = True moe_grouped_gemm: bool = True @@ -393,7 +392,6 @@ class Qwen3VLVisionProvider(TransformerConfig): class_token_len: int = 1 high_precision_rope: bool = True # _save_to_hf: bool = False - # use_flash_attention: bool = True # use_fused_linear_cross_entropy: bool = True # fuse_linear: bool = True # transform_rules: dict = field(default_factory=lambda: { diff --git a/paddleformers/transformers/qwen3_vl_moe/modeling.py b/paddleformers/transformers/qwen3_vl_moe/modeling.py index 32e57a562d0..0e300ba4504 100644 --- a/paddleformers/transformers/qwen3_vl_moe/modeling.py +++ b/paddleformers/transformers/qwen3_vl_moe/modeling.py @@ -354,6 +354,12 @@ def _gen_aoa_config(cls, config: Qwen3VLMoeConfig): f"model.language_model.layers.{layer_id}.self_attn.q_proj.weight^T, model.language_model.layers.{layer_id}.self_attn.k_proj.weight^T, model.language_model.layers.{layer_id}.self_attn.v_proj.weight^T -> {llm_prefix}{layer_id + 1}.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups={config.text_config.num_key_value_heads}" for layer_id in range(config.text_config.num_hidden_layers) ] + if config.attention_bias: + aoa_config["aoa_statements"] += [ + f"model.language_model.layers.{layer_id}.self_attn.q_proj.bias, model.language_model.layers.{layer_id}.self_attn.k_proj.bias, model.language_model.layers.{layer_id}.self_attn.v_proj.bias -> {llm_prefix}{layer_id + 1}.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups={config.text_config.num_key_value_heads}" + for layer_id in range(config.text_config.num_hidden_layers) + ] + aoa_config["aoa_statements"] += [ lm_state for layer_id in range(config.text_config.num_hidden_layers) @@ -535,6 +541,12 @@ def _gen_inv_aoa_config(cls, config: Qwen3VLMoeConfig): f"{llm_prefix}{layer_id + 1}.self_attn.qkv_proj.weight -> 
model.language_model.layers.{layer_id}.self_attn.q_proj.weight, model.language_model.layers.{layer_id}.self_attn.k_proj.weight, model.language_model.layers.{layer_id}.self_attn.v_proj.weight, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups = {config.text_config.num_key_value_heads}" for layer_id in range(config.text_config.num_hidden_layers) ] + if config.attention_bias: + aoa_config["aoa_statements"] += [ + f"{llm_prefix}{layer_id + 1}.self_attn.qkv_proj.bias -> model.language_model.layers.{layer_id}.self_attn.q_proj.bias, model.language_model.layers.{layer_id}.self_attn.k_proj.bias, model.language_model.layers.{layer_id}.self_attn.v_proj.bias, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups = {config.text_config.num_key_value_heads}" + for layer_id in range(config.text_config.num_hidden_layers) + ] + aoa_config["aoa_statements"] += [ f"{llm_prefix}layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.language_model.layers.{layer_id}.self_attn.{x}_proj.weight" for layer_id in range(config.text_config.num_hidden_layers) @@ -653,19 +665,9 @@ def _gen_aoa_config(cls, config: Qwen3VLMoeConfig): ] # attention qkv - if not config.text_config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"model.language_model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - aoa_config["aoa_statements"] += [ - f"model.language_model.layers.$LAYER_ID.self_attn.{x}_proj.bias -> {llm_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: - aoa_config["aoa_statements"] += [ - f"model.language_model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.language_model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.language_model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups={config.text_config.num_key_value_heads}" - ] + aoa_config["aoa_statements"] += [ + f"model.language_model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.language_model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.language_model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups={config.text_config.num_key_value_heads}" + ] # Qwen3_VLMoeModel without lm_head if cls._tied_weights_keys: @@ -745,24 +747,14 @@ def _gen_inv_aoa_config(cls, config: Qwen3VLMoeConfig): ] # attention qkv - if not config.text_config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> model.language_model.layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias -> model.language_model.layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: - aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.language_model.layers.$LAYER_ID.self_attn.q_proj.weight, model.language_model.layers.$LAYER_ID.self_attn.k_proj.weight, model.language_model.layers.$LAYER_ID.self_attn.v_proj.weight, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups = {config.text_config.num_key_value_heads}", - ] - aoa_config["aoa_statements"] += [ - 
f"{llm_prefix}layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.language_model.layers.{layer_id}.self_attn.{x}_proj.weight" - for layer_id in range(config.text_config.num_hidden_layers) - for x in ("q", "k", "v") - ] + aoa_config["aoa_statements"] += [ + f"{llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> {llm_prefix}layers.$LAYER_ID.self_attn.q_proj.weight, {llm_prefix}layers.$LAYER_ID.self_attn.k_proj.weight, {llm_prefix}layers.$LAYER_ID.self_attn.v_proj.weight, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups = {config.text_config.num_key_value_heads}", + ] + aoa_config["aoa_statements"] += [ + f"{llm_prefix}layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.language_model.layers.{layer_id}.self_attn.{x}_proj.weight" + for layer_id in range(config.text_config.num_hidden_layers) + for x in ("q", "k", "v") + ] # Qwen3VLMoeModel without lm_head if cls._tied_weights_keys: @@ -1193,7 +1185,6 @@ def __init__(self, config: Qwen3VLMoeTextConfig, layer_idx: Optional[int] = None ) self.sequence_parallel = config.sequence_parallel - self.fuse_attention_qkv = config.fuse_attention_qkv self.gqa_or_mqa = config.num_attention_heads != config.num_key_value_heads if config.tensor_model_parallel_size > 1: @@ -1210,36 +1201,13 @@ def __init__(self, config: Qwen3VLMoeTextConfig, layer_idx: Optional[int] = None kv_hidden_size = self.config.num_key_value_heads * self.head_dim q_hidden_size = self.config.num_attention_heads * self.head_dim - if not self.fuse_attention_qkv: - self.q_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - self.k_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - self.v_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - else: - self.qkv_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size + 2 * kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) + self.qkv_proj = GeneralLinear.create( + config.hidden_size, + q_hidden_size + 2 * kv_hidden_size, + has_bias=config.attention_bias, + config=config, + tp_plan="colwise", + ) self.o_proj = GeneralLinear.create( q_hidden_size, config.hidden_size, @@ -1262,46 +1230,29 @@ def forward( attn_mask_startend_row_indices: Optional[paddle.Tensor] = None, **kwargs, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: - if not self.fuse_attention_qkv: - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - else: - bsz, q_len, _ = hidden_states.shape - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.reshape(bsz, q_len, -1, self.head_dim) - key_states = key_states.reshape(bsz, q_len, -1, self.head_dim) - value_states = value_states.reshape(bsz, q_len, -1, self.head_dim) - + mix_layer = self.qkv_proj(hidden_states) + if self.sequence_parallel: + max_sequence_length = self.config.max_sequence_length + bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length + q_len = max_sequence_length + target_shape = [ + bsz, + q_len, + 
self.num_key_value_heads, + (self.num_key_value_groups + 2) * self.head_dim, + ] else: - mix_layer = self.qkv_proj(hidden_states) - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - target_shape = [ - bsz, - q_len, - self.num_key_value_heads, - (self.num_key_value_groups + 2) * self.head_dim, - ] - else: - target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] - # mix_layer = mix_layer.reshape(target_shape) - mix_layer = paddle.reshape_(mix_layer, target_shape) - query_states, key_states, value_states = paddle.split( - mix_layer, - num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], - axis=-1, - ) - if self.gqa_or_mqa: - # query_states = query_states.reshape([0, 0, self.num_heads, self.head_dim]) - query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) + target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] + # mix_layer = mix_layer.reshape(target_shape) + mix_layer = paddle.reshape_(mix_layer, target_shape) + query_states, key_states, value_states = paddle.split( + mix_layer, + num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], + axis=-1, + ) + if self.gqa_or_mqa: + # query_states = query_states.reshape([0, 0, self.num_heads, self.head_dim]) + query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) # apply qk_norm query_states = self.q_norm(query_states) @@ -1356,7 +1307,7 @@ def __init__(self, config: Qwen3VLMoeTextConfig, layer_idx: int): ): self.mlp = Qwen3VLMoeTextSparseMoeBlock(config) else: - self.mlp = Qwen3VLMoeTextMLP(config, fuse_up_gate=config.fuse_attention_ffn) + self.mlp = Qwen3VLMoeTextMLP(config, fuse_up_gate=True) self.input_layernorm = GeneralNorm.create( config=config, norm_type="rms_norm", diff --git a/scripts/regression/test_dpo_tiny-random-glm4moe.py b/scripts/regression/test_dpo_tiny-random-glm4moe.py index 344eca6be17..8b6029502fa 100644 --- a/scripts/regression/test_dpo_tiny-random-glm4moe.py +++ b/scripts/regression/test_dpo_tiny-random-glm4moe.py @@ -133,8 +133,6 @@ def test_dpo_full(self): "max_steps": MAX_STEPS, "save_steps": SAVE_STEPS, "sharding": "stage1", - "fuse_attention_qkv": "true", - "fuse_attention_ffn": "true", "template": TEMPLATE, } config_path = os.path.join(CONFIG_PATH, "full.yaml") @@ -242,8 +240,6 @@ def test_dpo_full_tp_pp(self): "output_dir": output_dir, "max_steps": MAX_STEPS, "save_steps": SAVE_STEPS, - "fuse_attention_qkv": "true", - "fuse_attention_ffn": "true", "template": TEMPLATE, } config_path = os.path.join(CONFIG_PATH, "full_tp_pp.yaml") @@ -295,8 +291,6 @@ def test_dpo_lora_tp_pp(self): "output_dir": output_dir, "max_steps": MAX_STEPS, "save_steps": SAVE_STEPS, - "fuse_attention_qkv": "true", - "fuse_attention_ffn": "true", "template": TEMPLATE, } config_path = os.path.join(CONFIG_PATH, "lora_tp_pp.yaml") diff --git a/scripts/regression/test_pt_tiny-random-glm4moe.py b/scripts/regression/test_pt_tiny-random-glm4moe.py index dd707b9ea3b..b6a05289546 100644 --- a/scripts/regression/test_pt_tiny-random-glm4moe.py +++ b/scripts/regression/test_pt_tiny-random-glm4moe.py @@ -131,8 +131,6 @@ def test_pt_full(self): "max_steps": MAX_STEPS, "save_steps": SAVE_STEPS, "sharding": "stage1", - "fuse_attention_qkv": "true", - "fuse_attention_ffn": "true", } config_path = 
os.path.join(CONFIG_PATH, "full.yaml") updated_config_path = self.pttrain_tester.update_training_args(config_path, output_dir, update_args) @@ -183,8 +181,6 @@ def test_pt_lora(self): "max_steps": MAX_STEPS, "save_steps": SAVE_STEPS, "sharding": "stage1", - "fuse_attention_qkv": "true", - "fuse_attention_ffn": "true", } config_path = os.path.join(CONFIG_PATH, "lora.yaml") updated_config_path = self.pttrain_tester.update_training_args(config_path, output_dir, update_args) @@ -242,8 +238,6 @@ def test_pt_full_tp_pp(self): "output_dir": output_dir, "max_steps": MAX_STEPS, "save_steps": SAVE_STEPS, - "fuse_attention_qkv": "true", - "fuse_attention_ffn": "true", } config_path = os.path.join(CONFIG_PATH, "full_tp_pp.yaml") updated_config_path = self.pttrain_tester.update_training_args(config_path, output_dir, update_args) @@ -293,8 +287,6 @@ def test_pt_lora_tp_pp(self): "output_dir": output_dir, "max_steps": MAX_STEPS, "save_steps": SAVE_STEPS, - "fuse_attention_qkv": "true", - "fuse_attention_ffn": "true", } config_path = os.path.join(CONFIG_PATH, "lora_tp_pp.yaml") updated_config_path = self.pttrain_tester.update_training_args(config_path, output_dir, update_args) diff --git a/scripts/regression/test_sft_tiny-random-glm4moe.py b/scripts/regression/test_sft_tiny-random-glm4moe.py index face70c85fa..c42d7531cb5 100644 --- a/scripts/regression/test_sft_tiny-random-glm4moe.py +++ b/scripts/regression/test_sft_tiny-random-glm4moe.py @@ -132,8 +132,6 @@ def test_sft_full(self): "max_steps": MAX_STEPS, "save_steps": SAVE_STEPS, "sharding": "stage1", - "fuse_attention_qkv": "true", - "fuse_attention_ffn": "true", "template": TEMPLATE, } config_path = os.path.join(CONFIG_PATH, "full.yaml") @@ -184,8 +182,6 @@ def test_sft_lora(self): "max_steps": MAX_STEPS, "save_steps": SAVE_STEPS, "sharding": "stage1", - "fuse_attention_qkv": "true", - "fuse_attention_ffn": "true", "template": TEMPLATE, } config_path = os.path.join(CONFIG_PATH, "lora.yaml") @@ -244,8 +240,6 @@ def test_sft_full_tp_pp(self): "output_dir": output_dir, "max_steps": MAX_STEPS, "save_steps": SAVE_STEPS, - "fuse_attention_qkv": "true", - "fuse_attention_ffn": "true", "template": TEMPLATE, } config_path = os.path.join(CONFIG_PATH, "full_tp_pp.yaml") @@ -296,8 +290,6 @@ def test_sft_lora_tp_pp(self): "output_dir": output_dir, "max_steps": MAX_STEPS, "save_steps": SAVE_STEPS, - "fuse_attention_qkv": "true", - "fuse_attention_ffn": "true", "template": TEMPLATE, } config_path = os.path.join(CONFIG_PATH, "lora_tp_pp.yaml") @@ -358,8 +350,6 @@ def test_sft_full_function_call(self): "max_steps": MAX_STEPS, "save_steps": SAVE_STEPS, "sharding": "stage1", - "fuse_attention_qkv": "true", - "fuse_attention_ffn": "true", "template": TEMPLATE, } config_path = os.path.join(CONFIG_PATH, "full_function_call.yaml") diff --git a/tests/config/benchmark/config/pt/GLM4.5-Air.yaml b/tests/config/benchmark/config/pt/GLM4.5-Air.yaml index eeeffc49e88..3a171c8ff6b 100644 --- a/tests/config/benchmark/config/pt/GLM4.5-Air.yaml +++ b/tests/config/benchmark/config/pt/GLM4.5-Air.yaml @@ -15,7 +15,7 @@ prefetch_factor: 24 ### model model_name_or_path: /root/paddlejob/gpfs/efficient_benchmark/huggingface/GLM-4.5-Air -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -66,7 +66,6 @@ using_sonic_moe: true moe_grouped_gemm: true apply_rope_fusion: true -fuse_rms_norm: true fp32_residual_connection: false # moe_router_force_load_balancing: true @@ -94,9 +93,6 @@ save_checkpoint_format: flex_checkpoint load_checkpoint_format: flex_checkpoint 
continue_training: true -fuse_attention_qkv: true -fuse_attention_ffn: true - tensorwise_offload_optimizer: true benchmark: true diff --git a/tests/config/benchmark/config/pt/GLM4.5-Air_64k.yaml b/tests/config/benchmark/config/pt/GLM4.5-Air_64k.yaml index a09207380df..f0b99da5bd0 100644 --- a/tests/config/benchmark/config/pt/GLM4.5-Air_64k.yaml +++ b/tests/config/benchmark/config/pt/GLM4.5-Air_64k.yaml @@ -15,7 +15,7 @@ padding_free: true ### model model_name_or_path: /root/paddlejob/gpfs/efficient_benchmark/huggingface/GLM-4.5-Air -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -63,7 +63,6 @@ moe_grouped_gemm: true moe_deep_gemm: false apply_rope_fusion: true -fuse_rms_norm: true moe_router_force_load_balancing: false sequence_parallel: true @@ -98,7 +97,5 @@ benchmark: true dataloader_num_workers: 24 prefetch_factor: 24 -fuse_attention_qkv: true -fuse_attention_ffn: true fp32_residual_connection: false tensorwise_offload_optimizer: true diff --git a/tests/config/benchmark/config/pt/Qwen3-30B-A3B-Base-64k.yaml b/tests/config/benchmark/config/pt/Qwen3-30B-A3B-Base-64k.yaml index 2ab9e71e9dc..2721cd5142a 100644 --- a/tests/config/benchmark/config/pt/Qwen3-30B-A3B-Base-64k.yaml +++ b/tests/config/benchmark/config/pt/Qwen3-30B-A3B-Base-64k.yaml @@ -22,7 +22,7 @@ prefetch_factor: 24 ### model model_name_or_path: /root/paddlejob/gpfs/huangjiyi/Models/Qwen3-30B-A3B -attn_impl: flashmask +_attn_implementation: flashmask use_qk_norm: true ### finetuning @@ -75,7 +75,6 @@ split_param: true stage1_overlap: true apply_rope_fusion: true -fuse_rms_norm: true moe_deep_gemm: true moe_grouped_gemm: true moe_router_fusion: true @@ -98,9 +97,6 @@ amp_master_grad: true bf16: true fp16_opt_level: O2 -fuse_attention_qkv: true -fuse_attention_ffn: true - save_checkpoint_format: "flex_checkpoint" load_checkpoint_format: "flex_checkpoint" diff --git a/tests/config/benchmark/config/pt/Qwen3-30B-A3B-Base.yaml b/tests/config/benchmark/config/pt/Qwen3-30B-A3B-Base.yaml index 82c4ed72737..377efbc6045 100644 --- a/tests/config/benchmark/config/pt/Qwen3-30B-A3B-Base.yaml +++ b/tests/config/benchmark/config/pt/Qwen3-30B-A3B-Base.yaml @@ -24,7 +24,7 @@ prefetch_factor: 24 ### model model_name_or_path: Qwen/Qwen3-30B-A3B-Base -attn_impl: flashmask +_attn_implementation: flashmask use_qk_norm: true ### finetuning @@ -75,7 +75,6 @@ stage1_overlap: true sd_release_grads: true apply_rope_fusion: true -fuse_rms_norm: true moe_grouped_gemm: true moe_ep_barrier: false moe_router_fusion: true @@ -94,9 +93,6 @@ bf16: true fp16_opt_level: O2 amp_master_grad: true -fuse_attention_qkv: true -fuse_attention_ffn: true - save_checkpoint_format: "flex_checkpoint" load_checkpoint_format: "flex_checkpoint" diff --git a/tests/config/benchmark/config/sft/GLM4.5-Air.yaml b/tests/config/benchmark/config/sft/GLM4.5-Air.yaml index dcb2c917180..d7c8fa7b2a8 100644 --- a/tests/config/benchmark/config/sft/GLM4.5-Air.yaml +++ b/tests/config/benchmark/config/sft/GLM4.5-Air.yaml @@ -15,7 +15,7 @@ prefetch_factor: 24 ### model model_name_or_path: /root/paddlejob/gpfs/efficient_benchmark/huggingface/GLM-4.5-Air -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -66,7 +66,6 @@ using_sonic_moe: true moe_grouped_gemm: true apply_rope_fusion: true -fuse_rms_norm: true fp32_residual_connection: false # moe_router_force_load_balancing: true @@ -94,9 +93,6 @@ save_checkpoint_format: flex_checkpoint load_checkpoint_format: flex_checkpoint continue_training: true -fuse_attention_qkv: true 
-fuse_attention_ffn: true - tensorwise_offload_optimizer: true benchmark: true diff --git a/tests/config/benchmark/config/sft/GLM4.5-Air_64k.yaml b/tests/config/benchmark/config/sft/GLM4.5-Air_64k.yaml index f37a5e55422..57e65d3a9ef 100644 --- a/tests/config/benchmark/config/sft/GLM4.5-Air_64k.yaml +++ b/tests/config/benchmark/config/sft/GLM4.5-Air_64k.yaml @@ -16,7 +16,7 @@ padding_free: true ### model model_name_or_path: /root/paddlejob/gpfs/efficient_benchmark/huggingface/GLM-4.5-Air -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -64,7 +64,6 @@ moe_grouped_gemm: true moe_deep_gemm: false apply_rope_fusion: true -fuse_rms_norm: true moe_router_force_load_balancing: false sequence_parallel: true @@ -99,7 +98,5 @@ benchmark: true dataloader_num_workers: 24 prefetch_factor: 24 -fuse_attention_qkv: true -fuse_attention_ffn: true fp32_residual_connection: false tensorwise_offload_optimizer: true diff --git a/tests/config/benchmark/config/sft/Qwen3-30B-A3B-Base-64k.yaml b/tests/config/benchmark/config/sft/Qwen3-30B-A3B-Base-64k.yaml index 42c680a14c4..5e41e5f3721 100644 --- a/tests/config/benchmark/config/sft/Qwen3-30B-A3B-Base-64k.yaml +++ b/tests/config/benchmark/config/sft/Qwen3-30B-A3B-Base-64k.yaml @@ -22,7 +22,7 @@ prefetch_factor: 24 ### model model_name_or_path: /root/paddlejob/gpfs/huangjiyi/Models/Qwen3-30B-A3B -attn_impl: flashmask +_attn_implementation: flashmask use_qk_norm: true ### finetuning @@ -75,7 +75,6 @@ split_param: true stage1_overlap: true apply_rope_fusion: true -fuse_rms_norm: true moe_deep_gemm: true moe_grouped_gemm: true moe_router_fusion: true @@ -98,9 +97,6 @@ amp_master_grad: true bf16: true fp16_opt_level: O2 -fuse_attention_qkv: true -fuse_attention_ffn: true - save_checkpoint_format: "flex_checkpoint" load_checkpoint_format: "flex_checkpoint" diff --git a/tests/config/benchmark/config/sft/Qwen3-30B-A3B-Base.yaml b/tests/config/benchmark/config/sft/Qwen3-30B-A3B-Base.yaml index 4f65349d711..c6b571dfda8 100644 --- a/tests/config/benchmark/config/sft/Qwen3-30B-A3B-Base.yaml +++ b/tests/config/benchmark/config/sft/Qwen3-30B-A3B-Base.yaml @@ -24,7 +24,7 @@ prefetch_factor: 24 ### model model_name_or_path: Qwen/Qwen3-30B-A3B-Base -attn_impl: flashmask +_attn_implementation: flashmask use_qk_norm: true ### finetuning @@ -75,7 +75,6 @@ stage1_overlap: true sd_release_grads: true apply_rope_fusion: true -fuse_rms_norm: true moe_grouped_gemm: true moe_ep_barrier: false moe_router_fusion: true @@ -95,9 +94,6 @@ bf16: true fp16_opt_level: O2 amp_master_grad: true -fuse_attention_qkv: true -fuse_attention_ffn: true - save_checkpoint_format: "flex_checkpoint" load_checkpoint_format: "flex_checkpoint" diff --git a/tests/config/ci/glm45_dpo.yaml b/tests/config/ci/glm45_dpo.yaml index 0de6b4ba544..113424153e9 100644 --- a/tests/config/ci/glm45_dpo.yaml +++ b/tests/config/ci/glm45_dpo.yaml @@ -11,8 +11,8 @@ mix_strategy: concat ### model model_name_or_path: zai-org/GLM-4.5-Air-Base/ -#attn_impl: sdpa -attn_impl: flashmask +#_attn_implementation: sdpa +_attn_implementation: flashmask ### finetuning # base @@ -66,7 +66,5 @@ moe_router_force_load_balancing: true clear_every_step_cache: true partial_send_recv: false #use_cpu_initialization: true -fuse_attention_qkv: true -fuse_attention_ffn: true num_empty_layers_add_in_tail: 1 \ No newline at end of file diff --git a/tests/config/ci/glm45_lora.yaml b/tests/config/ci/glm45_lora.yaml index 5fc40e364f1..122e017e796 100644 --- a/tests/config/ci/glm45_lora.yaml +++ b/tests/config/ci/glm45_lora.yaml @@ 
-11,7 +11,7 @@ mix_strategy: concat ### model model_name_or_path: ../zai-org/GLM-4.5-Air -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 @@ -42,8 +42,6 @@ warmup_steps: 20 learning_rate: 1.0e-4 # performance -fuse_attention_qkv: true -fuse_attention_ffn: true moe_token_dispatcher_type: "deepep" gated_linear_unit: true tensor_model_parallel_size: 4 diff --git a/tests/config/ci/glm45_pt.yaml b/tests/config/ci/glm45_pt.yaml index 4c0b3f53665..de0e34c92a3 100644 --- a/tests/config/ci/glm45_pt.yaml +++ b/tests/config/ci/glm45_pt.yaml @@ -11,7 +11,7 @@ mix_strategy: concat ### model model_name_or_path: ./GLM-4.5-Air -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -65,6 +65,4 @@ gated_linear_unit: true num_hidden_layers: 3 apply_rope_fusion: true moe_router_fusion: true -router_aux_loss_coef: 0.001 -fuse_attention_qkv: true -fuse_attention_ffn: true \ No newline at end of file +router_aux_loss_coef: 0.001 \ No newline at end of file diff --git a/tests/config/ci/glm45_pt_fp8.yaml b/tests/config/ci/glm45_pt_fp8.yaml index 4779482fa61..11e2cb76066 100644 --- a/tests/config/ci/glm45_pt_fp8.yaml +++ b/tests/config/ci/glm45_pt_fp8.yaml @@ -11,7 +11,7 @@ mix_strategy: concat ### model model_name_or_path: ./GLM-4.5-Air -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -66,6 +66,4 @@ num_hidden_layers: 3 apply_rope_fusion: true moe_router_fusion: true router_aux_loss_coef: 0.001 -fp8: "e4m3" -fuse_attention_qkv: true -fuse_attention_ffn: true \ No newline at end of file +fp8: "e4m3" \ No newline at end of file diff --git a/tests/config/ci/glm45_pt_grouped_gemm.yaml b/tests/config/ci/glm45_pt_grouped_gemm.yaml index c4cf1478b61..b0977f7314e 100644 --- a/tests/config/ci/glm45_pt_grouped_gemm.yaml +++ b/tests/config/ci/glm45_pt_grouped_gemm.yaml @@ -11,7 +11,7 @@ mix_strategy: concat ### model model_name_or_path: ./GLM-4.5-Air -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -66,8 +66,6 @@ num_hidden_layers: 3 apply_rope_fusion: true moe_router_fusion: true router_aux_loss_coef: 0.001 -fuse_attention_qkv: true -fuse_attention_ffn: true # grouped gemm moe_grouped_gemm: true diff --git a/tests/config/ci/glm45_sft.yaml b/tests/config/ci/glm45_sft.yaml index 066f15c2e94..7c8cd4939d6 100644 --- a/tests/config/ci/glm45_sft.yaml +++ b/tests/config/ci/glm45_sft.yaml @@ -11,7 +11,7 @@ mix_strategy: concat ### model model_name_or_path: ./zai-org/GLM-4.5-Air -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 @@ -42,8 +42,6 @@ warmup_steps: 20 learning_rate: 1.0e-4 # performance -fuse_attention_qkv: true -fuse_attention_ffn: true moe_token_dispatcher_type: "deepep" gated_linear_unit: true tensor_model_parallel_size: 4 diff --git a/tests/config/ci/glm45_single_pt-test.yaml b/tests/config/ci/glm45_single_pt-test.yaml index 0bb12fcac3b..a44009636ea 100644 --- a/tests/config/ci/glm45_single_pt-test.yaml +++ b/tests/config/ci/glm45_single_pt-test.yaml @@ -9,7 +9,7 @@ split: "998,1,1" ### modelv model_name_or_path: /home/.cache/glm45/GLM-4.5-Air -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/tests/config/ci/qwen3_multicard_lora.yaml b/tests/config/ci/qwen3_multicard_lora.yaml index ebada0e2b99..474ec6350f8 100644 --- a/tests/config/ci/qwen3_multicard_lora.yaml +++ b/tests/config/ci/qwen3_multicard_lora.yaml @@ -17,7 +17,7 @@ mix_strategy: concat ### model model_name_or_path: ./checkpoints/qwen3-30b-a3b-sft gated_linear_unit: true 
-attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 @@ -67,10 +67,7 @@ optim: adamw bf16: true fp16_opt_level: O2 amp_master_grad: true -fuse_attention_qkv: true -fuse_attention_ffn: true fuse_swiglu: true -fuse_rms_norm: true fuse_linear: true use_paddlefleet: true use_qk_norm: true diff --git a/tests/config/ci/qwen3_multicard_pt.yaml b/tests/config/ci/qwen3_multicard_pt.yaml index f8b4509186b..aa91aff36bb 100644 --- a/tests/config/ci/qwen3_multicard_pt.yaml +++ b/tests/config/ci/qwen3_multicard_pt.yaml @@ -16,7 +16,7 @@ mix_strategy: concat ### model model_name_or_path: ./Qwen3-30B-A3B gated_linear_unit: true -attn_impl: flashmask +_attn_implementation: flashmask num_hidden_layers: 4 ### finetuning @@ -65,10 +65,7 @@ optim: adamw bf16: true fp16_opt_level: O2 amp_master_grad: true -fuse_attention_qkv: true -fuse_attention_ffn: true fuse_swiglu: true -fuse_rms_norm: true fuse_linear: true use_paddlefleet: true use_qk_norm: true diff --git a/tests/config/ci/qwen3_multicard_sft.yaml b/tests/config/ci/qwen3_multicard_sft.yaml index 388c2b7cfab..ec5a8df0493 100644 --- a/tests/config/ci/qwen3_multicard_sft.yaml +++ b/tests/config/ci/qwen3_multicard_sft.yaml @@ -17,7 +17,7 @@ mix_strategy: concat ### model model_name_or_path: ./checkpoints/qwen3-30b-a3b-pt gated_linear_unit: true -attn_impl: flashmask +_attn_implementation: flashmask num_hidden_layers: 4 ### finetuning @@ -66,10 +66,7 @@ optim: adamw bf16: true fp16_opt_level: O2 amp_master_grad: true -fuse_attention_qkv: true -fuse_attention_ffn: true fuse_swiglu: true -fuse_rms_norm: true fuse_linear: true use_paddlefleet: true use_qk_norm: true diff --git a/tests/config/ci/qwen3_pt.yaml b/tests/config/ci/qwen3_pt.yaml index ed92131055b..f50c8652947 100644 --- a/tests/config/ci/qwen3_pt.yaml +++ b/tests/config/ci/qwen3_pt.yaml @@ -12,7 +12,7 @@ mix_strategy: concat ### model model_name_or_path: ./Qwen3-30B-A3B-Base -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -53,8 +53,6 @@ optim: adamw bf16: true fp16_opt_level: O2 amp_master_grad: true -fuse_attention_qkv: true -fuse_attention_ffn: true fuse_swiglu: true use_qk_norm: true diff --git a/tests/mergekit/test_merge_model.py b/tests/mergekit/test_merge_model.py index e71cb2aa171..3db7acbd8e3 100644 --- a/tests/mergekit/test_merge_model.py +++ b/tests/mergekit/test_merge_model.py @@ -145,8 +145,6 @@ def test_fuse_qkv_lora_merge_torch(self): from paddleformers.transformers import Qwen3Config, Qwen3ForCausalLM model_config = Qwen3Config.from_pretrained(torch_model_path) - model_config.fuse_attention_qkv = True - model_config.fuse_attention_ffn = True fused_base_model = Qwen3ForCausalLM.from_pretrained( torch_model_path, config=model_config, diff --git a/tests/peft/test_lora.py b/tests/peft/test_lora.py index a7679e6b2b0..c0b1cc78922 100644 --- a/tests/peft/test_lora.py +++ b/tests/peft/test_lora.py @@ -87,7 +87,7 @@ def test_load_regular_linear(self): class TestLoraModel(unittest.TestCase): def test_lora_model_restore(self): lora_config = LoRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], + target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, enable_lora_list=[None, [True, False]], @@ -109,7 +109,7 @@ def test_lora_model_restore(self): @parameterized.expand([(None,), ("all",), ("lora",)]) def test_lora_model_constructor(self, bias): lora_config = LoRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], + target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, enable_lora_list=[None, [True, False]], @@ -149,7 +149,7 @@ def 
test_lora_model_save_load(self): with TemporaryDirectory() as tempdir: input_ids = paddle.to_tensor(np.random.randint(100, 200, [1, 20])) lora_config = LoRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], + target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, ) @@ -182,7 +182,7 @@ def test_lora_module_raise_exception(self): LoRAModel(model, lora_config) def test_lora_get_merge_state_dict(self): - lora_config = LoRAConfig(target_modules=[".*q_proj.*", ".*v_proj.*"], r=4, lora_alpha=8) + lora_config = LoRAConfig(target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8) model = AutoModelForCausalLM.from_pretrained("PaddleFormers/tiny-random-qwen3", convert_from_hf=True) model.eval() lora_model = LoRAModel(model, lora_config) @@ -201,7 +201,7 @@ def test_lora_get_merge_state_dict(self): self.assertIsInstance(merged_weight, paddle.Tensor) - if any(target in k for target in ["q_proj", "v_proj"]): + if any(target in k for target in ["qkv_proj"]): lora_A_key = k.replace("weight", "lora_A") lora_B_key = k.replace("weight", "lora_B") @@ -230,12 +230,15 @@ def test_lora_model_save_load_fc(self): with TemporaryDirectory() as tempdir: input_ids = paddle.to_tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) lora_config = LoRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], + target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, ) model = Glm4MoeModel.from_pretrained( - "PaddleFormers/tiny-random-glm4moe", download_hub="aistudio", convert_from_hf=True + "PaddleFormers/tiny-random-glm4moe-bf16", + download_hub="aistudio", + convert_from_hf=True, + dtype="float32", ) lora_model = LoRAModel(model, lora_config) lora_model.eval() diff --git a/tests/peft/test_lorapro.py b/tests/peft/test_lorapro.py index 9dc5e2103a8..13fc110be49 100644 --- a/tests/peft/test_lorapro.py +++ b/tests/peft/test_lorapro.py @@ -93,7 +93,7 @@ def tearDown(self): def test_lorapro_model_restore(self): lorapro_config = LoRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], + target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, enable_lora_list=[None, [True, False]], @@ -116,7 +116,7 @@ def test_lorapro_model_restore(self): @parameterized.expand([(None,), ("all",), ("lora",)]) def test_lorapro_model_constructor(self, bias): lorapro_config = LoRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], + target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, enable_lora_list=[None, [True, False]], @@ -156,7 +156,7 @@ def test_lorapro_model_constructor(self, bias): def test_lorapro_model_save_load(self): with TemporaryDirectory() as tempdir: input_ids = paddle.to_tensor(np.random.randint(100, 200, [1, 20])) - lorapro_config = LoRAConfig(target_modules=[".*q_proj.*", ".*v_proj.*"], r=4, lora_alpha=8, lorapro=True) + lorapro_config = LoRAConfig(target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, lorapro=True) model = AutoModelForCausalLM.from_pretrained("PaddleFormers/tiny-random-qwen3", convert_from_hf=True) lorapro_model = LoRAModel(model, lorapro_config) lorapro_model.eval() @@ -177,7 +177,7 @@ def test_lorapro_model_save_load(self): def test_lorapro_modes(self, x_mode): """Test if AdamWLoRAPro optimizer with different x_modes can perform optimization steps""" lorapro_config = LoRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], + target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, enable_lora_list=[None, [True, False]], diff --git a/tests/peft/test_mora.py b/tests/peft/test_mora.py index 9324d2f4913..0b53d038db4 100644 --- a/tests/peft/test_mora.py +++ b/tests/peft/test_mora.py @@ -94,7 +94,7 @@ def test_unmerge(self): 
class TestMoraModel(unittest.TestCase): def test_mora_model_restore(self): mora_config = LoRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], + target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, enable_lora_list=[None, [True, False]], @@ -117,7 +117,7 @@ def test_mora_model_restore(self): @parameterized.expand([(None,), ("all",), ("lora",)]) def test_mora_model_constructor(self, bias): mora_config = LoRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], + target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, enable_lora_list=[None, [True, False]], @@ -157,7 +157,7 @@ def test_mora_model_constructor(self, bias): def test_mora_model_save_load(self): with TemporaryDirectory() as tempdir: input_ids = paddle.to_tensor(np.random.randint(100, 200, [1, 20])) - mora_config = LoRAConfig(target_modules=[".*q_proj.*", ".*v_proj.*"], r=4, lora_alpha=8, use_mora=True) + mora_config = LoRAConfig(target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, use_mora=True) model = AutoModelForCausalLM.from_pretrained("PaddleFormers/tiny-random-qwen3", convert_from_hf=True) mora_model = LoRAModel(model, mora_config) mora_model.eval() diff --git a/tests/peft/test_mos_lora.py b/tests/peft/test_mos_lora.py index 929926959ec..e50f80f3cc2 100644 --- a/tests/peft/test_mos_lora.py +++ b/tests/peft/test_mos_lora.py @@ -97,7 +97,7 @@ def test_unmerge(self): class TestMosLoraModel(unittest.TestCase): def test_lora_model_restore(self): lora_config = LoRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], + target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, enable_lora_list=[None, [True, False]], @@ -119,7 +119,7 @@ def test_lora_model_restore(self): def test_parallel_support(self): lora_config = LoRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], + target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, enable_lora_list=[None, [True, False]], @@ -135,7 +135,7 @@ def test_parallel_support(self): @parameterized.expand([(None,), ("all",), ("lora",)]) def test_lora_model_constructor(self, bias): lora_config = LoRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], + target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, enable_lora_list=[None, [True, False]], @@ -175,9 +175,7 @@ def test_lora_model_constructor(self, bias): def test_lora_model_save_load(self): with TemporaryDirectory() as tempdir: input_ids = paddle.to_tensor(np.random.randint(100, 200, [1, 20])) - lora_config = LoRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], r=4, lora_alpha=8, lora_use_mixer=True - ) + lora_config = LoRAConfig(target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, lora_use_mixer=True) model = AutoModelForCausalLM.from_pretrained("PaddleFormers/tiny-random-qwen3", convert_from_hf=True) lora_model = LoRAModel(model, lora_config) lora_model.eval() diff --git a/tests/peft/test_quant_lora.py b/tests/peft/test_quant_lora.py index db2ef08a3e2..bfbae40d538 100644 --- a/tests/peft/test_quant_lora.py +++ b/tests/peft/test_quant_lora.py @@ -98,7 +98,7 @@ class TestQuantedLoRAModel(unittest.TestCase): @classmethod def setUpClass(cls): lora_config = LoRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], + target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, ) @@ -128,8 +128,8 @@ def test_count_model_layers(self): self.lora_model.train() quant_lora_model = qat.quantize(self.lora_model, inplace=False) quantizer_cnt = self._count_layers(quant_lora_model, FakeQuanterWithAbsMaxObserverLayer) - # 2 LoRA layers (q_proj, v_proj) per transformer layer - self.assertEqual(quantizer_cnt, 2 * self.model.config.num_hidden_layers) + # 1 LoRA layer 
(qkv_proj) per transformer layer + self.assertEqual(quantizer_cnt, self.model.config.num_hidden_layers) def test_forward_no_quant(self): q_config = QuantConfig(activation=None, weight=None) diff --git a/tests/trainer/test_unified_checkpoint.py b/tests/trainer/test_unified_checkpoint.py index 12b63aae806..59c0033f657 100644 --- a/tests/trainer/test_unified_checkpoint.py +++ b/tests/trainer/test_unified_checkpoint.py @@ -66,7 +66,6 @@ "sharding": "", "virtual_pipeline_model_parallel_size": 1, "sequence_parallel": 0, - "fuse_rms_norm": "false", "max_seq_len": 1024, "learning_rate": 3e-04, "min_learning_rate": 1e-05, diff --git a/tests/transformers/auto/test_configuration.py b/tests/transformers/auto/test_configuration.py index a2326b8be22..522a6a35ba2 100644 --- a/tests/transformers/auto/test_configuration.py +++ b/tests/transformers/auto/test_configuration.py @@ -124,8 +124,6 @@ def test_load_from_custom_arch(self): "bos_token_id": 1, "do_normalize": False, "eos_token_id": 2, - "fuse_attention_ffn": False, - "fuse_attention_qkv": False, "fuse_sequence_parallel_allreduce": False, "hidden_act": "silu", "hidden_size": 4096, @@ -151,7 +149,6 @@ def test_load_from_custom_arch(self): "tensor_parallel_output": True, "tie_word_embeddings": False, "transformers_version": "4.28.1", - "fuse_rms_norm": False, "apply_rope_fusion": False, "use_recompute": False, "virtual_pipeline_model_parallel_size": 1, diff --git a/tests/transformers/deepseek_v3/test_modeling.py b/tests/transformers/deepseek_v3/test_modeling.py index a3139d06c11..ce41a1fc499 100644 --- a/tests/transformers/deepseek_v3/test_modeling.py +++ b/tests/transformers/deepseek_v3/test_modeling.py @@ -72,7 +72,7 @@ def __init__( num_labels=3, num_choices=4, pad_token_id=0, - aux_loss_alpha=0.001, + router_aux_loss_coef=0.001, first_k_dense_replace=1, hidden_act="silu", scope=None, @@ -104,7 +104,7 @@ def __init__( self.num_experts_per_tok = num_experts_per_tok self.first_k_dense_replace = first_k_dense_replace self.norm_topk_prob = norm_topk_prob - self.aux_loss_alpha = aux_loss_alpha + self.router_aux_loss_coef = router_aux_loss_coef self.hidden_act = hidden_act self.max_position_embeddings = max_position_embeddings self.initializer_range = initializer_range @@ -166,7 +166,7 @@ def get_config(self) -> DeepseekV3Config: num_experts_per_tok=self.num_experts_per_tok, first_k_dense_replace=self.first_k_dense_replace, norm_topk_prob=self.norm_topk_prob, - aux_loss_alpha=self.aux_loss_alpha, + router_aux_loss_coef=self.router_aux_loss_coef, hidden_act=self.hidden_act, max_position_embeddings=self.max_position_embeddings, initializer_range=self.initializer_range, diff --git a/tests/transformers/ernie4_5/test_modeling.py b/tests/transformers/ernie4_5/test_modeling.py index 6d3c1585cd3..7b14020fe2a 100644 --- a/tests/transformers/ernie4_5/test_modeling.py +++ b/tests/transformers/ernie4_5/test_modeling.py @@ -518,49 +518,3 @@ def test_ernie4_5_converter_from_local_dir(self): rtol=1e-2, ) ) - - # 4. 
forward with fc - from paddleformers.transformers import Ernie4_5Config, Ernie4_5ForCausalLM - - uc_load_model = Ernie4_5ForCausalLM.from_pretrained( - self.torch_model_path, - convert_from_hf=True, - dtype="float32", - load_checkpoint_format="", - ) - fc_load_model = Ernie4_5ForCausalLM.from_pretrained( - self.torch_model_path, dtype="float32", load_checkpoint_format="flex_checkpoint" - ) - uc_load_model.eval() - fc_load_model.eval() - uc_logit = uc_load_model(paddle.to_tensor(input_ids))[0] - fc_logit = fc_load_model(paddle.to_tensor(input_ids))[0] - self.assertTrue( - np.allclose( - uc_logit.detach().cpu().reshape([-1])[:9].astype("float32").numpy(), - fc_logit.detach().cpu().reshape([-1])[:9].float().numpy(), - atol=1e-5, - rtol=1e-5, - ) - ) - - # 5. fuse qkv/ffn with fc - model_config = Ernie4_5Config.from_pretrained(self.torch_model_path) - model_config.fuse_attention_qkv = True - model_config.fuse_attention_ffn = True - fc_fused_load_model = Ernie4_5ForCausalLM.from_pretrained( - self.torch_model_path, - config=model_config, - dtype="float32", - load_checkpoint_format="flex_checkpoint", - ) - fc_fused_load_model.eval() - fc_fused_logit = fc_fused_load_model(paddle.to_tensor(input_ids))[0] - self.assertTrue( - np.allclose( - fc_logit.detach().cpu().reshape([-1])[:9].astype("float32").numpy(), - fc_fused_logit.detach().cpu().reshape([-1])[:9].astype("float32").numpy(), - atol=1e-5, - rtol=1e-5, - ) - ) diff --git a/tests/transformers/gemma3_text/test_modeling.py b/tests/transformers/gemma3_text/test_modeling.py index 5d8735b908b..b774252e49c 100644 --- a/tests/transformers/gemma3_text/test_modeling.py +++ b/tests/transformers/gemma3_text/test_modeling.py @@ -324,8 +324,6 @@ def create_and_check_tp(self, config, input_ids, input_mask, *args): Gemma3ForCausalLM(config) def create_and_check_fuse_attn(self, config, input_ids, input_mask, *args): - config.fuse_attention_qkv = True - config.fuse_attention_ffn = True model = Gemma3ForCausalLM(config) model.eval() diff --git a/tests/transformers/glm4_moe/test_modeling.py b/tests/transformers/glm4_moe/test_modeling.py index 037b592eb66..8baa856a11f 100644 --- a/tests/transformers/glm4_moe/test_modeling.py +++ b/tests/transformers/glm4_moe/test_modeling.py @@ -378,37 +378,27 @@ def test_save_load(self): for model_class in self.all_model_classes: # test from_pretrained model1 = model_class.from_pretrained( - "PaddleFormers/tiny-random-glm4moe", + "PaddleFormers/tiny-random-glm4moe-bf16", download_hub="aistudio", - convert_from_hf=True, - load_checkpoint_format="", + load_checkpoint_format="flex_checkpoint", ) - - model2 = model_class.from_pretrained( - "PaddleFormers/tiny-random-glm4moe", download_hub="aistudio", load_checkpoint_format="flex_checkpoint" - ) - model_state_1 = model1.state_dict() - model_state_2 = model2.state_dict() - - for k, v in model_state_1.items(): - md51 = v._md5sum() - md52 = model_state_2[k]._md5sum() - assert md51 == md52 # test save_pretrained with tempfile.TemporaryDirectory() as tmpdirname: - model2.save_pretrained(tmpdirname, save_checkpoint_format="flex_checkpoint") - model3 = model_class.from_pretrained(tmpdirname, convert_from_hf=True, load_checkpoint_format="") - model_state_3 = model3.state_dict() + model1.save_pretrained(tmpdirname, save_checkpoint_format="flex_checkpoint") + model2 = model_class.from_pretrained( + tmpdirname, convert_from_hf=True, load_checkpoint_format="flex_checkpoint" + ) + model_state_2 = model2.state_dict() - for k, v in model_state_3.items(): - md53 = v._md5sum() - md52 = 
model_state_2[k]._md5sum() + model_state_1 = model1.state_dict() + for k, v in model_state_2.items(): + md52 = v._md5sum() + md51 = model_state_1[k]._md5sum() if k.endswith(".mlp.gate.weight"): + md51 = model_state_1[k].cast("bfloat16")._md5sum() md52 = model_state_2[k].cast("bfloat16")._md5sum() - md53 = model_state_3[k].cast("bfloat16")._md5sum() - assert md52 == md53 + assert md51 == md52 def test_hidden_states_output(self): pass diff --git a/tests/transformers/qwen2/test_modeling.py b/tests/transformers/qwen2/test_modeling.py index 601eef63014..3531a17276c 100644 --- a/tests/transformers/qwen2/test_modeling.py +++ b/tests/transformers/qwen2/test_modeling.py @@ -480,8 +480,6 @@ def test_Qwen2_converter_from_local_dir(self): # 4. fuse qkv/ffn with fc model_config = Qwen2Config.from_pretrained(tempdir) - model_config.fuse_attention_qkv = True - model_config.fuse_attention_ffn = True paddle_model_fused = Qwen2ForCausalLM.from_pretrained( tempdir, config=model_config, diff --git a/tests/transformers/qwen2_5_vl/test_modeling.py b/tests/transformers/qwen2_5_vl/test_modeling.py index 27bfcd87985..11f31754019 100644 --- a/tests/transformers/qwen2_5_vl/test_modeling.py +++ b/tests/transformers/qwen2_5_vl/test_modeling.py @@ -500,38 +500,6 @@ def test_sample_generate(self): else: self.assertTrue(output_generate[0].shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1]) - def test_save_load_flex_checkpoint(self): - for model_class in self.all_model_classes: - with tempfile.TemporaryDirectory() as tmpdirname: - tiny_vision_config = { - "depth": 4, - "intermediate_size": 64, - "hidden_size": 64, - "out_hidden_size": 128, - "fullatt_block_indexes": [1], - } - config = Qwen2_5_VLConfig( - num_hidden_layers=4, - intermediate_size=256, - hidden_size=128, - tie_word_embedding=False, - vision_config=tiny_vision_config, - ) - model = model_class(config) - model.save_pretrained(tmpdirname, save_checkpoint_format="flex_checkpoint") - - model1 = model_class.from_pretrained(tmpdirname, convert_from_hf=True, load_checkpoint_format="") - - model2 = model_class.from_pretrained(tmpdirname, load_checkpoint_format="flex_checkpoint") - - model_state_1 = model1.state_dict() - model_state_2 = model2.state_dict() - - for k, v in model_state_1.items(): - md51 = v._md5sum() - md52 = model_state_2[k]._md5sum() - assert md51 == md52 - class Qwen2_5_VLIntegrationTest(unittest.TestCase): def setUp(self): @@ -956,8 +924,6 @@ def test_Qwen2_5_VL_classes_from_local_dir(self, class_name, pytorch_class_name: paddle_model_fused = paddle_model_class.from_pretrained( tempdir, dtype="float32", - fuse_attention_qkv=True, - fuse_attention_ffn=True, load_checkpoint_format="flex_checkpoint", ).eval() diff --git a/tests/transformers/qwen2moe/test_modeling.py b/tests/transformers/qwen2moe/test_modeling.py index 23692865605..b19f6720ff0 100644 --- a/tests/transformers/qwen2moe/test_modeling.py +++ b/tests/transformers/qwen2moe/test_modeling.py @@ -442,8 +442,6 @@ def test_Qwen2Moe_converter_from_local_dir(self): # 4. fuse qkv/ffn with fc model_config = Qwen2MoeConfig.from_pretrained(tempdir) - model_config.fuse_attention_qkv = True - model_config.fuse_attention_ffn = True paddle_model_fused = Qwen2MoeForCausalLM.from_pretrained( tempdir, config=model_config, diff --git a/tests/transformers/qwen3/test_modeling.py b/tests/transformers/qwen3/test_modeling.py index a644458063c..4c034460375 100644 --- a/tests/transformers/qwen3/test_modeling.py +++ b/tests/transformers/qwen3/test_modeling.py @@ -484,8 +484,6 @@ def test_Qwen3_converter_from_local_dir(self): # 4.
fuse qkv/ffn with fc model_config = Qwen3Config.from_pretrained(tempdir) - model_config.fuse_attention_qkv = True - model_config.fuse_attention_ffn = True paddle_model_fused = Qwen3ForCausalLM.from_pretrained( tempdir, config=model_config, diff --git a/tests/transformers/qwen3_vl/test_modeling.py b/tests/transformers/qwen3_vl/test_modeling.py index b61e3a3b446..bca227b1aee 100644 --- a/tests/transformers/qwen3_vl/test_modeling.py +++ b/tests/transformers/qwen3_vl/test_modeling.py @@ -1003,8 +1003,6 @@ def test_Qwen3VL_classes_from_local_dir(self, class_name, pytorch_class_name: st paddle_model_fused = paddle_model_class.from_pretrained( tempdir, dtype="float32", - fuse_attention_qkv=True, - fuse_attention_ffn=True, load_checkpoint_format="flex_checkpoint", ).eval() diff --git a/tests/transformers/qwen3_vl_moe/test_modeling.py b/tests/transformers/qwen3_vl_moe/test_modeling.py index 0b97867ad7b..339d4c0a19e 100644 --- a/tests/transformers/qwen3_vl_moe/test_modeling.py +++ b/tests/transformers/qwen3_vl_moe/test_modeling.py @@ -1040,8 +1040,6 @@ def test_Qwen3VLMoe_classes_from_local_dir(self, class_name, pytorch_class_name: paddle_model_fused = paddle_model_class.from_pretrained( tempdir, dtype="float32", - fuse_attention_qkv=True, - fuse_attention_ffn=True, load_checkpoint_format="flex_checkpoint", ).eval() diff --git a/tests/transformers/qwen3moe/test_modeling.py b/tests/transformers/qwen3moe/test_modeling.py index 17a5a55263a..6c897d1fb6c 100644 --- a/tests/transformers/qwen3moe/test_modeling.py +++ b/tests/transformers/qwen3moe/test_modeling.py @@ -444,8 +444,6 @@ def test_Qwen3Moe_converter_from_local_dir(self): # 4. fuse qkv/ffn with fc model_config = Qwen3MoeConfig.from_pretrained(tempdir) - model_config.fuse_attention_qkv = True - model_config.fuse_attention_ffn = True paddle_model_fused = Qwen3MoeForCausalLM.from_pretrained( tempdir, config=model_config, diff --git a/tests/transformers/test_configuration_utils.py b/tests/transformers/test_configuration_utils.py index d8f3bebd048..fc0341932ed 100644 --- a/tests/transformers/test_configuration_utils.py +++ b/tests/transformers/test_configuration_utils.py @@ -89,8 +89,6 @@ def test_parse_config_with_single_config(self): def test_model_config_save(self): # 1. 
single config config = FakeSimplePretrainedModelConfig(a=10, b=11, c=12) - config.fuse_attention_qkv = True - config.fuse_rms_norm = True config.tensor_model_parallel_size = 8 config.tensor_parallel_output = True @@ -107,8 +105,6 @@ def test_model_config_save(self): import json loaded_config = json.load(open(os.path.join(tp, "config.json"), "r")) - assert "fuse_attention_qkv" in loaded_config, "fuse qkv is need to save" - assert "fuse_rms_norm" not in loaded_config, "fuse_rms_norm don't need to save" assert "tensor_model_parallel_size" in loaded_config, "tensor_model_parallel_size need to save" assert "paddleformers_version" in loaded_config, "always save paddleformers_version" assert ( diff --git a/tests/transformers/test_conversion_common.py b/tests/transformers/test_conversion_common.py index 018e018ccf6..18c9ce35ad1 100644 --- a/tests/transformers/test_conversion_common.py +++ b/tests/transformers/test_conversion_common.py @@ -17,7 +17,6 @@ import glob import os import tempfile -import unittest import paddle @@ -236,17 +235,3 @@ def forward(self, input_ids): config_fast_ffn.convert_fast_ffn = True common_test_save_and_load(config_no_fast_ffn, config_fast_ffn, TestForCausalLM) - - -from paddleformers.transformers import LlamaConfig, LlamaForCausalLM - - -class TestFuseOrSplit(unittest.TestCase): - def test_model_split_to_fuse(self): - _test_split_to_fuse(LlamaConfig, LlamaForCausalLM) - - def test_model_fuse_to_split(self): - _test_fuse_to_split(LlamaConfig, LlamaForCausalLM) - - def test_model_convert_fast_ffn(self): - _test_fast_ffn() diff --git a/tests/transformers/test_shard_checkpoint.py b/tests/transformers/test_shard_checkpoint.py deleted file mode 100644 index e09bfbe2ad1..00000000000 --- a/tests/transformers/test_shard_checkpoint.py +++ /dev/null @@ -1,486 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -import os -import tempfile -import unittest - -import paddle - -from paddleformers.transformers import ( - AutoConfig, - AutoModelForCausalLM, - LlamaModel, - PretrainedConfig, - PretrainedModel, - Qwen3Model, - register_base_model, -) -from paddleformers.transformers.model_utils import ( - load_sharded_checkpoint, - shard_checkpoint, -) -from paddleformers.utils.env import ( - PADDLE_WEIGHTS_INDEX_NAME, - PADDLE_WEIGHTS_NAME, - SAFE_WEIGHTS_INDEX_NAME, - SAFE_WEIGHTS_NAME, -) -from paddleformers.utils.import_utils import is_paddle_cuda_available -from tests.testing_utils import require_package - - -class FakeConfig(PretrainedConfig): - def __init__(self, **kwargs): - super().__init__(**kwargs) - - -class FakePretrainedModel(PretrainedModel): - config_class = FakeConfig - - _keep_in_fp32_modules = ["norm."] - - -@register_base_model -class FakeModel(FakePretrainedModel): - def __init__(self, config): - super(FakeModel, self).__init__(config) - self.linear = paddle.nn.Linear(2, 3) - self.norm = paddle.nn.LayerNorm(2) - - -class TestFromPretrained(unittest.TestCase): - def test_from_pretrained_low_cpu_mem_usage_functional(self): - # test that we can use `from_pretrained(..., low_cpu_mem_usage=True)` with normal and - # sharded models - mnames = [ - "Paddleformers/tiny-random-llama3-shard", - "Paddleformers/tiny-random-llama3", - ] - convert_from_hf = [False, True] - for mname, convert in zip(mnames, convert_from_hf): - m1 = LlamaModel.from_pretrained( - mname, - low_cpu_mem_usage=True, - convert_from_hf=convert, - load_checkpoint_format="", - ) - m2 = LlamaModel.from_pretrained( - mname, - low_cpu_mem_usage=False, - convert_from_hf=convert, - load_checkpoint_format="", - ) - for p1, p2 in zip(m1.parameters(), m2.parameters()): - self.assertTrue(paddle.allclose(p1.float(), p2.float())) - - @unittest.skipIf(not is_paddle_cuda_available(), "some op is missing in cpu mode") - def test_keep_in_fp32_modules(self): - with tempfile.TemporaryDirectory() as tempdir: - config = PretrainedConfig() - model = FakeModel.from_config(config, dtype="float16") - model.config = config - model.save_pretrained(tempdir, save_to_hf=False, save_checkpoint_format="") - - # check model_state.pdparams - state_dict = paddle.load(os.path.join(tempdir, "model_state.pdparams")) - - self.assertEqual(state_dict["linear.weight"].dtype, paddle.float16) - self.assertEqual(state_dict["norm.weight"].dtype, paddle.float16) - - new_model = FakeModel.from_pretrained(tempdir, convert_from_hf=False, load_checkpoint_format="") - self.assertEqual(new_model.linear.weight.dtype, paddle.float16) - self.assertEqual(new_model.norm.weight.dtype, paddle.float32) - - def test_load_sharded_checkpoint(self): - config = AutoConfig.from_pretrained("Paddleformers/tiny-random-llama3-shard") - model = LlamaModel.from_pretrained( - "Paddleformers/tiny-random-llama3-shard", - convert_from_hf=False, - load_checkpoint_format="", - ) - - with tempfile.TemporaryDirectory() as tmp_dir: - model.save_pretrained(tmp_dir, max_shard_size="200kiB", save_to_hf=False, save_checkpoint_format="") - model_load = LlamaModel.from_config(config) - missing_keys, unexpected_keys = load_sharded_checkpoint(model_load, tmp_dir) - - self.assertEqual(missing_keys, []) - self.assertEqual(unexpected_keys, []) - for p1, p2 in zip(model.parameters(), model_load.parameters()): - self.assertTrue(paddle.allclose(p1, p2)) - - @unittest.skipIf(not is_paddle_cuda_available(), "some op is missing in cpu mode") - def test_load_from_torch_dtyp_cast(self): - pass - - 
@unittest.skipIf(not is_paddle_cuda_available(), "some op is missing in cpu mode") - def test_load_dtype_cast(self): - dtype_prefix_len = len("paddle.") - - def inner_convert_test(src_dtype, dst_dtype): - str_src_dtype = str(src_dtype)[dtype_prefix_len:] - str_dst_dtype = str(dst_dtype)[dtype_prefix_len:] - - config = AutoConfig.from_pretrained("PaddleFormers/tiny-random-qwen3") - model = Qwen3Model.from_config(config, dtype=str_src_dtype) - - with tempfile.TemporaryDirectory() as tmp_dir: - model.save_pretrained(tmp_dir, save_to_hf=False, save_checkpoint_format="") - new_model = Qwen3Model.from_pretrained( - tmp_dir, dtype=str_dst_dtype, convert_from_hf=False, load_checkpoint_format="" - ) - - for k, v in model.state_dict().items(): - if v.is_floating_point(): - self.assertEqual(v.dtype, src_dtype) - for k, v in new_model.state_dict().items(): - if v.is_floating_point(): - self.assertEqual(v.dtype, dst_dtype) - - with self.subTest("paddle.float32 to paddle.float16"): - inner_convert_test(paddle.float32, paddle.float16) - with self.subTest("paddle.float32 to paddle.bfloat16"): - inner_convert_test(paddle.float32, paddle.bfloat16) - with self.subTest("paddle.float16 to paddle.float32"): - inner_convert_test(paddle.float16, paddle.float32) - with self.subTest("paddle.float16 to paddle.bfloat16"): - inner_convert_test(paddle.float16, paddle.bfloat16) - with self.subTest("paddle.bfloat16 to paddle.float32"): - inner_convert_test(paddle.bfloat16, paddle.float32) - with self.subTest("paddle.bfloat16 to paddle.float16"): - inner_convert_test(paddle.bfloat16, paddle.float16) - - -class TestShardCheckpoint(unittest.TestCase): - def test_shard_checkpoint(self): - # This is the model we will use, total size 340,000 bytes. - model = paddle.nn.Sequential( - paddle.nn.Linear(100, 200, bias_attr=False), # size 80,000 - paddle.nn.Linear(200, 200, bias_attr=False), # size 160,000 - paddle.nn.Linear(200, 100, bias_attr=False), # size 80,000 - paddle.nn.Linear(100, 50, bias_attr=False), # size 20,000 - ) - state_dict = model.state_dict() - - with self.subTest("No shard when max size is bigger than model size"): - shards, index = shard_checkpoint(state_dict) - self.assertIsNone(index) - self.assertDictEqual(shards, {PADDLE_WEIGHTS_NAME: state_dict}) - - with self.subTest("Test sharding, no weights bigger than max size"): - shards, index = shard_checkpoint(state_dict, max_shard_size="300kB") - # Split is first two layers then last two. - self.assertDictEqual( - index, - { - "metadata": {"total_size": 340000}, - "weight_map": { - "0.weight": "model_state-00001-of-00002.pdparams", - "1.weight": "model_state-00001-of-00002.pdparams", - "2.weight": "model_state-00002-of-00002.pdparams", - "3.weight": "model_state-00002-of-00002.pdparams", - }, - }, - ) - - shard1 = {"0.weight": state_dict["0.weight"], "1.weight": state_dict["1.weight"]} - shard2 = {"2.weight": state_dict["2.weight"], "3.weight": state_dict["3.weight"]} - self.assertDictEqual( - shards, {"model_state-00001-of-00002.pdparams": shard1, "model_state-00002-of-00002.pdparams": shard2} - ) - - with self.subTest("Test sharding with weights bigger than max size"): - shards, index = shard_checkpoint(state_dict, max_shard_size="100kB") - # Split is first layer, second layer then last 2. 
- self.assertDictEqual( - index, - { - "metadata": {"total_size": 340000}, - "weight_map": { - "0.weight": "model_state-00001-of-00003.pdparams", - "1.weight": "model_state-00002-of-00003.pdparams", - "2.weight": "model_state-00003-of-00003.pdparams", - "3.weight": "model_state-00003-of-00003.pdparams", - }, - }, - ) - - shard1 = {"0.weight": state_dict["0.weight"]} - shard2 = {"1.weight": state_dict["1.weight"]} - shard3 = {"2.weight": state_dict["2.weight"], "3.weight": state_dict["3.weight"]} - self.assertDictEqual( - shards, - { - "model_state-00001-of-00003.pdparams": shard1, - "model_state-00002-of-00003.pdparams": shard2, - "model_state-00003-of-00003.pdparams": shard3, - }, - ) - - def test_checkpoint_sharding_local(self): - model = LlamaModel.from_pretrained( - "Paddleformers/tiny-random-llama3-shard", - convert_from_hf=False, - load_checkpoint_format="", - ) - - with tempfile.TemporaryDirectory() as tmp_dir: - # We use the same folder for various sizes to make sure a new save erases the old checkpoint. - for max_size in ["50kB", "50kiB", "100kB", "100kiB", "200kB", "200kiB"]: - model.save_pretrained(tmp_dir, max_shard_size=max_size, save_to_hf=False, save_checkpoint_format="") - - # Get each shard file and its size - shard_to_size = {} - for shard in os.listdir(tmp_dir): - if shard.endswith(".pdparams"): - shard_file = os.path.join(tmp_dir, shard) - shard_to_size[shard_file] = os.path.getsize(shard_file) - - index_file = os.path.join(tmp_dir, PADDLE_WEIGHTS_INDEX_NAME) - # Check there is an index but no regular weight file - self.assertTrue(os.path.isfile(index_file)) - self.assertFalse(os.path.isfile(os.path.join(tmp_dir, PADDLE_WEIGHTS_NAME))) - - # Check a file is bigger than max_size only when it has a single weight - for shard_file, size in shard_to_size.items(): - if max_size.endswith("kiB"): - max_size_int = int(max_size[:-3]) * 2**10 - else: - max_size_int = int(max_size[:-2]) * 10**3 - # Note: pickle adds some junk so the weight of the file can end up being slightly bigger than - # the size asked for (since we count parameters) - if size >= max_size_int + 50000: - state_dict = paddle.load(shard_file) - self.assertEqual(len(state_dict), 1) - - # Check the index and the shard files found match - with open(index_file, "r", encoding="utf-8") as f: - index = json.loads(f.read()) - - all_shards = set(index["weight_map"].values()) - shards_found = {f for f in os.listdir(tmp_dir) if f.endswith(".pdparams")} - self.assertSetEqual(all_shards, shards_found) - - # Finally, check the model can be reloaded - new_model = LlamaModel.from_pretrained(tmp_dir, convert_from_hf=False, load_checkpoint_format="") - for p1, p2 in zip(model.parameters(), new_model.parameters()): - self.assertTrue(paddle.allclose(p1, p2)) - - def test_checkpoint_sharding_from_hub(self): - model = LlamaModel.from_pretrained( - "Paddleformers/tiny-random-llama3-shard", - convert_from_hf=False, - load_checkpoint_format="", - ) - - # the model above is the same as the model below, just a sharded version. 
- ref_model = LlamaModel.from_pretrained( - "Paddleformers/tiny-random-llama3-shard", - convert_from_hf=False, - load_checkpoint_format="", - ) - for p1, p2 in zip(model.parameters(), ref_model.parameters()): - self.assertTrue(paddle.allclose(p1, p2)) - - def test_checkpoint_variant_local(self): - model = AutoModelForCausalLM.from_pretrained( - "PaddleFormers/tiny-random-qwen3", convert_from_hf=True, load_checkpoint_format="" - ) - - with tempfile.TemporaryDirectory() as tmp_dir: - model.save_pretrained(tmp_dir, variant="v2", save_to_hf=False, save_checkpoint_format="") - - weights_name = ".".join(PADDLE_WEIGHTS_NAME.split(".")[:-1] + ["v2"] + ["pdparams"]) - - weights_file = os.path.join(tmp_dir, weights_name) - self.assertTrue(os.path.isfile(weights_file)) - self.assertFalse(os.path.isfile(os.path.join(tmp_dir, PADDLE_WEIGHTS_NAME))) - - with self.assertRaises(EnvironmentError): - _ = Qwen3Model.from_pretrained(tmp_dir, convert_from_hf=False, load_checkpoint_format="") - - new_model = Qwen3Model.from_pretrained( - tmp_dir, variant="v2", convert_from_hf=False, load_checkpoint_format="" - ) - - for p1, p2 in zip(model.parameters(), new_model.parameters()): - self.assertTrue(paddle.allclose(p1, p2)) - - def test_checkpoint_variant_local_sharded(self): - model = AutoModelForCausalLM.from_pretrained( - "PaddleFormers/tiny-random-qwen3", convert_from_hf=True, load_checkpoint_format="" - ) - - with tempfile.TemporaryDirectory() as tmp_dir: - model.save_pretrained( - tmp_dir, variant="v2", max_shard_size="50kB", save_to_hf=False, save_checkpoint_format="" - ) - - weights_index_name = ".".join(PADDLE_WEIGHTS_INDEX_NAME.split(".")[:-1] + ["v2"] + ["json"]) - weights_index_file = os.path.join(tmp_dir, weights_index_name) - self.assertTrue(os.path.isfile(weights_index_file)) - self.assertFalse(os.path.isfile(os.path.join(tmp_dir, PADDLE_WEIGHTS_INDEX_NAME))) - - for i in range(1, 10): - weights_name = ".".join(PADDLE_WEIGHTS_NAME.split(".")[:-1] + [f"v2-0000{i}-of-00020"] + ["pdparams"]) - weights_name_file = os.path.join(tmp_dir, weights_name) - self.assertTrue(os.path.isfile(weights_name_file)) - - for i in range(10, 21): - weights_name = ".".join(PADDLE_WEIGHTS_NAME.split(".")[:-1] + [f"v2-000{i}-of-00020"] + ["pdparams"]) - weights_name_file = os.path.join(tmp_dir, weights_name) - self.assertTrue(os.path.isfile(weights_name_file)) - - with self.assertRaises(EnvironmentError): - _ = Qwen3Model.from_pretrained(tmp_dir, convert_from_hf=False, load_checkpoint_format="") - - new_model = Qwen3Model.from_pretrained( - tmp_dir, variant="v2", convert_from_hf=False, load_checkpoint_format="" - ) - - for p1, p2 in zip(model.parameters(), new_model.parameters()): - self.assertTrue(paddle.allclose(p1, p2)) - - @require_package("safetensors") - def test_checkpoint_variant_local_safe(self): - model = AutoModelForCausalLM.from_pretrained( - "PaddleFormers/tiny-random-qwen3", convert_from_hf=True, load_checkpoint_format="" - ) - - with tempfile.TemporaryDirectory() as tmp_dir: - model.save_pretrained( - tmp_dir, variant="v2", safe_serialization=True, save_to_hf=False, save_checkpoint_format="" - ) - - weights_name = ".".join(SAFE_WEIGHTS_NAME.split(".")[:-1] + ["v2"] + ["safetensors"]) - - weights_file = os.path.join(tmp_dir, weights_name) - - self.assertTrue(os.path.isfile(weights_file)) - self.assertFalse(os.path.isfile(os.path.join(tmp_dir, SAFE_WEIGHTS_NAME))) - - with self.assertRaises(EnvironmentError): - _ = Qwen3Model.from_pretrained(tmp_dir, convert_from_hf=False, load_checkpoint_format="") - - 
-             new_model = Qwen3Model.from_pretrained(
-                 tmp_dir, variant="v2", convert_from_hf=False, load_checkpoint_format=""
-             )
-
-             for p1, p2 in zip(model.parameters(), new_model.parameters()):
-                 self.assertTrue(paddle.allclose(p1, p2))
-
-     @require_package("safetensors")
-     def test_checkpoint_variant_local_sharded_safe(self):
-         model = AutoModelForCausalLM.from_pretrained(
-             "PaddleFormers/tiny-random-qwen3", convert_from_hf=True, load_checkpoint_format=""
-         )
-
-         with tempfile.TemporaryDirectory() as tmp_dir:
-             model.save_pretrained(
-                 tmp_dir,
-                 variant="v2",
-                 max_shard_size="50kB",
-                 safe_serialization=True,
-                 save_to_hf=False,
-                 save_checkpoint_format="",
-             )
-
-             weights_index_name = ".".join(SAFE_WEIGHTS_INDEX_NAME.split(".")[:-1] + ["v2"] + ["json"])
-             weights_index_file = os.path.join(tmp_dir, weights_index_name)
-             self.assertTrue(os.path.isfile(weights_index_file))
-             self.assertFalse(os.path.isfile(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME)))
-
-             for i in range(1, 10):
-                 weights_name = ".".join(SAFE_WEIGHTS_NAME.split(".")[:-1] + [f"v2-0000{i}-of-00020"] + ["safetensors"])
-                 weights_name_file = os.path.join(tmp_dir, weights_name)
-                 self.assertTrue(os.path.isfile(weights_name_file))
-
-             for i in range(10, 21):
-                 weights_name = ".".join(SAFE_WEIGHTS_NAME.split(".")[:-1] + [f"v2-000{i}-of-00020"] + ["safetensors"])
-                 weights_name_file = os.path.join(tmp_dir, weights_name)
-                 self.assertTrue(os.path.isfile(weights_name_file))
-
-             with self.assertRaises(EnvironmentError):
-                 _ = Qwen3Model.from_pretrained(tmp_dir, convert_from_hf=False, load_checkpoint_format="")
-
-             new_model = Qwen3Model.from_pretrained(
-                 tmp_dir, variant="v2", convert_from_hf=False, load_checkpoint_format=""
-             )
-
-             for p1, p2 in zip(model.parameters(), new_model.parameters()):
-                 self.assertTrue(paddle.allclose(p1, p2))
-
-     def test_checkpoint_variant_hub(self):
-         with tempfile.TemporaryDirectory() as tmp_dir:
-             with self.assertRaises(EnvironmentError):
-                 _ = LlamaModel.from_pretrained(
-                     "Paddleformers/tiny-random-llama-variant",
-                     cache_dir=tmp_dir,
-                     convert_from_hf=False,
-                     load_checkpoint_format="",
-                 )
-
-             model = LlamaModel.from_pretrained(
-                 "Paddleformers/tiny-random-llama-variant",
-                 cache_dir=tmp_dir,
-                 variant="v2",
-                 convert_from_hf=False,
-                 load_checkpoint_format="",
-             )
-             self.assertIsNotNone(model)
-
-     def test_checkpoint_variant_hub_sharded(self):
-         with tempfile.TemporaryDirectory() as tmp_dir:
-             with self.assertRaises(EnvironmentError):
-                 _ = LlamaModel.from_pretrained(
-                     "Paddleformers/tiny-random-llama-variant-sharded",
-                     cache_dir=tmp_dir,
-                     convert_from_hf=False,
-                     load_checkpoint_format="",
-                 )
-             model = LlamaModel.from_pretrained(
-                 "Paddleformers/tiny-random-llama-variant-sharded",
-                 cache_dir=tmp_dir,
-                 variant="v2",
-                 convert_from_hf=False,
-                 load_checkpoint_format="",
-             )
-             self.assertIsNotNone(model)
-
-     def test_checkpoint_variant_save_load(self):
-         with tempfile.TemporaryDirectory() as tmp_dir:
-             model = LlamaModel.from_pretrained(
-                 "Paddleformers/tiny-random-llama-variant",
-                 cache_dir=tmp_dir,
-                 variant="v2",
-                 convert_from_hf=False,
-                 load_checkpoint_format="",
-             )
-             weights_name = ".".join(PADDLE_WEIGHTS_NAME.split(".")[:-1] + ["v2"] + ["pdparams"])
-
-             model.save_pretrained(tmp_dir, variant="v2", save_to_hf=False, save_checkpoint_format="")
-             # saving will create a variant checkpoint
-             self.assertTrue(os.path.isfile(os.path.join(tmp_dir, weights_name)))
-
-             model.save_pretrained(tmp_dir, save_to_hf=False, save_checkpoint_format="")
-             # saving shouldn't delete variant checkpoints
-             weights_name = ".".join(PADDLE_WEIGHTS_NAME.split(".")[:-1] + ["v2"] + ["pdparams"])
-             self.assertTrue(os.path.isfile(os.path.join(tmp_dir, weights_name)))
-
-             # there should be a normal checkpoint
-             self.assertTrue(os.path.isfile(os.path.join(tmp_dir, PADDLE_WEIGHTS_NAME)))
-
-         self.assertIsNotNone(model)