33 commits
c1d30ba  fleet args update addition. (Feiye0979, Jan 14, 2026)
48c60d6  fleet args update addition. (Feiye0979, Jan 14, 2026)
a89b73e  Merge remote-tracking branch 'origin/develop' into develop_unify_args… (Feiye0979, Jan 14, 2026)
450c60d  fleet args update addition. pre-commit check (Feiye0979, Jan 14, 2026)
713abf2  fleet args update addition. pre-commit check (Feiye0979, Jan 14, 2026)
39280cb  fleet args update addition. (Feiye0979, Jan 14, 2026)
7da9049  fleet args update addition. change accuracy (Feiye0979, Jan 14, 2026)
8021b13  Merge remote-tracking branch 'origin/develop' into develop_unify_args… (Feiye0979, Jan 14, 2026)
53ffc2e  Merge remote-tracking branch 'origin/develop' into develop_unify_args… (Feiye0979, Jan 15, 2026)
cbfff76  fleet args update addition. update & fix cases (Feiye0979, Jan 15, 2026)
06989ac  Merge branch 'PaddlePaddle:develop' into develop_unify_args_add (Feiye0979, Jan 15, 2026)
1637bad  fleet args update addition. fix cases (Feiye0979, Jan 15, 2026)
b5fda65  fleet args update addition. fix cases (Feiye0979, Jan 15, 2026)
d174da2  Merge remote-tracking branch 'origin/develop' into develop_unify_args… (Feiye0979, Jan 15, 2026)
8fa1ca3  Merge remote-tracking branch 'origin/develop' into develop_unify_args… (Feiye0979, Jan 15, 2026)
e90c9ff  Merge remote-tracking branch 'origin/develop' into develop_unify_args… (Feiye0979, Jan 19, 2026)
215cfb7  fleet args update addition. (Feiye0979, Jan 19, 2026)
0e18c9d  Merge remote-tracking branch 'origin/develop' into develop_unify_args… (Feiye0979, Jan 20, 2026)
3cc1405  Merge remote-tracking branch 'origin/develop' into develop_unify_args… (Feiye0979, Jan 20, 2026)
eec342c  fleet args update addition. fix accuracy. (Feiye0979, Jan 20, 2026)
1ba0633  fleet args update addition. fix accuracy. (Feiye0979, Jan 20, 2026)
81e6e05  fix ci copy. (Feiye0979, Jan 20, 2026)
7576b43  fleet args update addition. fix ci case. (Feiye0979, Jan 20, 2026)
b05f55f  fleet args update addition. fix ci case. (Feiye0979, Jan 20, 2026)
487662d  fleet args update addition. fix ci case. (Feiye0979, Jan 21, 2026)
8332c7b  fleet args update addition. fix ci case. (Feiye0979, Jan 21, 2026)
c5e0369  Merge remote-tracking branch 'origin/develop' into develop_unify_args… (Feiye0979, Jan 21, 2026)
319d8db  fleet args update addition. fix ci case. (Feiye0979, Jan 21, 2026)
180cf8b  fleet args update addition. fix ci case. (Feiye0979, Jan 21, 2026)
88ab74f  fleet args update addition. (Feiye0979, Jan 21, 2026)
730ad62  fleet args update addition. fix ci case. (Feiye0979, Jan 21, 2026)
1cfdede  fleet args update addition. testing fuse_rms_norm. (Feiye0979, Jan 21, 2026)
49e1944  fleet args update addition. testing revert accuracy. (Feiye0979, Jan 21, 2026)
6 changes: 3 additions & 3 deletions docs/zh/dpo_and_lora_guide.md
@@ -77,7 +77,7 @@ mix_strategy: concat

### model
model_name_or_path: baidu/ERNIE-4.5-0.3B-PT
-attn_impl: flashmask
+_attn_implementation: flashmask

### finetuning
# base
@@ -135,7 +135,7 @@ mix_strategy: concat

### model
model_name_or_path: baidu/ERNIE-4.5-0.3B-PT
-attn_impl: flashmask
+_attn_implementation: flashmask
lora: true
lora_rank: 8

@@ -187,7 +187,7 @@ load_checkpoint_format: flex_checkpoint

`model_name_or_path`: the local model path or the name of the corresponding HuggingFace repository, e.g. `baidu/ERNIE-4.5-0.3B-PT`; a model that has already been through SFT is recommended

-`attn_impl`: the attention mask implementation used by the model. `flashmask` is recommended; it is a core optimization technique built on FlashAttention.
+`_attn_implementation`: the attention mask implementation used by the model. `flashmask` is recommended; it is a core optimization technique built on FlashAttention.

`lora`: Bool; whether to train with LoRA. Defaults to `False`.
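For reference, the options described above map onto the model block of a training config like the one earlier in this diff. A minimal sketch (values such as the LoRA rank are illustrative and should be adapted to your setup):

```yaml
### model
model_name_or_path: baidu/ERNIE-4.5-0.3B-PT   # local path or HuggingFace repo name
_attn_implementation: flashmask               # FlashAttention-based attention mask implementation
lora: true                                    # enable LoRA training (defaults to false)
lora_rank: 8                                  # illustrative rank
```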

4 changes: 2 additions & 2 deletions docs/zh/pt_and_cpt_guide.md
@@ -59,7 +59,7 @@ mix_strategy: concat

### model
model_name_or_path: baidu/ERNIE-4.5-0.3B-Base-PT
-attn_impl: flashmask
+_attn_implementation: flashmask

### finetuning
# base
@@ -108,7 +108,7 @@ load_checkpoint_format: flex_checkpoint

`model_name_or_path`: the local model path or the name of the corresponding HuggingFace repository, e.g. `baidu/ERNIE-4.5-0.3B-Base-PT`

-`attn_impl`: the attention mask implementation used by the model. `flashmask` is recommended; it is a core optimization technique built on FlashAttention.
+`_attn_implementation`: the attention mask implementation used by the model. `flashmask` is recommended; it is a core optimization technique built on FlashAttention.

`stage`: related to the training type; set it to `PT` for pre-training

6 changes: 3 additions & 3 deletions docs/zh/sft_and_lora_guide.md
@@ -67,7 +67,7 @@ mix_strategy: concat

### model
model_name_or_path: baidu/ERNIE-4.5-0.3B-Base-PT
-attn_impl: flashmask
+_attn_implementation: flashmask

### finetuning
# base
@@ -124,7 +124,7 @@ mix_strategy: concat

### model
model_name_or_path: baidu/ERNIE-4.5-0.3B-Base-PT
-attn_impl: flashmask
+_attn_implementation: flashmask
lora: true
lora_rank: 8

@@ -175,7 +175,7 @@ load_checkpoint_format: flex_checkpoint

`model_name_or_path`: the local model path or the name of the corresponding HuggingFace repository, e.g. `baidu/ERNIE-4.5-0.3B-Base-PT`

-`attn_impl`: the attention mask implementation used by the model. `flashmask` is recommended; it is a core optimization technique built on FlashAttention.
+`_attn_implementation`: the attention mask implementation used by the model. `flashmask` is recommended; it is a core optimization technique built on FlashAttention.

`lora`: Bool; whether to train with LoRA. Defaults to `False`.

2 changes: 1 addition & 1 deletion docs/zh/training_arguments.md
@@ -283,7 +283,7 @@
--expert_model_parallel_size
The degree of parallelism for expert parallelism. (`int`, optional)

---aux_loss_alpha
+--router_aux_loss_coef
Weight coefficient of the auxiliary loss for MoE models. (`float`, optional, defaults to 0.0001)

--expert_max_capacity
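For illustration, the `expert_model_parallel_size` and `router_aux_loss_coef` arguments above can be set in a training YAML alongside the other options. This is a hypothetical excerpt with example values only, not a complete or recommended configuration:

```yaml
### distributed / MoE (example values only)
expert_model_parallel_size: 2      # parallel degree of expert parallelism
router_aux_loss_coef: 0.0001       # auxiliary-loss weight coefficient for MoE routing
```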
2 changes: 1 addition & 1 deletion examples/best_practices/DeepSeek-V3/SFT-Practice.md
@@ -80,4 +80,4 @@ mpirun bash run_dsv3_4k.sh
* In MoE models, load imbalance across experts can also trigger OOM errors, so introducing AuxLoss and the auxiliary-loss-free mechanism in a sensible way is essential. Key points learned from the experiments:
* Isolate the gate computation: e_score_correction_bias should only take part in computing the gating weights and must not be passed on to the downstream FFN modules (see the sketch below).
* Adapt the AuxLoss computation: under parallel strategies such as SP or Subbatch, mind the actual value of seq_len so that the loss is computed correctly.
-* Adjust the configuration: some settings provided by Hugging Face (e.g. aux_loss_alpha) need targeted tuning for the specific training scenario.
+* Adjust the configuration: some settings provided by Hugging Face (e.g. router_aux_loss_coef) need targeted tuning for the specific training scenario.
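To make the gate-isolation note concrete, below is a minimal, illustrative routing sketch. It is not the repository's actual implementation; the function name, tensor shapes, and the sigmoid scoring are assumptions. The point it shows: e_score_correction_bias only influences which experts are selected, while the combine weights passed on to the expert FFNs come from the unbiased scores.

```python
import paddle


def route_tokens(gate_logits, e_score_correction_bias, top_k):
    """Illustrative MoE routing: the correction bias steers expert selection
    only; the gating weights handed to the expert FFNs stay bias-free."""
    scores = paddle.nn.functional.sigmoid(gate_logits)         # [num_tokens, num_experts]
    biased = scores + e_score_correction_bias                  # bias used for selection only
    _, expert_idx = paddle.topk(biased, k=top_k, axis=-1)      # experts chosen per token
    weights = paddle.take_along_axis(scores, expert_idx, -1)   # unbiased scores of the chosen experts
    weights = weights / weights.sum(axis=-1, keepdim=True)     # normalized combine weights
    return expert_idx, weights
```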
4 changes: 1 addition & 3 deletions examples/best_practices/DeepSeek-V3/dsv3_128k_config.yaml
@@ -75,10 +75,8 @@ sharding: stage1
bf16: true
amp_master_grad: true
fp16_opt_level: O2
-use_flash_attention: true
-use_attn_mask_startend_row_indices: true
-using_fake_gate: false
+moe_router_force_load_balancing: false
pre_alloc_memory: 60
tensorwise_offload_optimizer: true
fuse_rms_norm: true
moe_subbatch_token_num_before_dispatch: 1024
4 changes: 1 addition & 3 deletions examples/best_practices/DeepSeek-V3/dsv3_32k_config.yaml
@@ -75,10 +75,8 @@ sharding: stage1
bf16: true
amp_master_grad: true
fp16_opt_level: O2
-use_flash_attention: true
-use_attn_mask_startend_row_indices: true
-using_fake_gate: false
+moe_router_force_load_balancing: false
pre_alloc_memory: 60
tensorwise_offload_optimizer: true
fuse_rms_norm: true
moe_subbatch_token_num_before_dispatch: 0
4 changes: 1 addition & 3 deletions examples/best_practices/DeepSeek-V3/dsv3_4k_config.yaml
@@ -75,10 +75,8 @@ sharding: stage1
bf16: true
amp_master_grad: true
fp16_opt_level: O2
-use_flash_attention: true
-use_attn_mask_startend_row_indices: true
-using_fake_gate: false
+moe_router_force_load_balancing: false
pre_alloc_memory: 60
tensorwise_offload_optimizer: true
fuse_rms_norm: true
moe_subbatch_token_num_before_dispatch: 0
@@ -9,8 +9,8 @@
"AutoModel": "DeepseekV2ModelFast",
"AutoModelForCausalLM": "DeepseekV2ForCausalLM"
},
"aux_loss_alpha": 0.0001,
"aux_loss_free_gamma": 0.0,
"router_aux_loss_coef": 0.0001,
"moe_router_bias_update_rate": 0.0,
"bos_token_id": 0,
"eos_token_id": 1,
"ep_size": 1,
@@ -61,8 +61,6 @@
"v_head_dim": 128,
"vocab_size": 129280,
"using_flex_token": true,
"fuse_rms_norm": true,
"fuse_attention_ffn": true,
"apply_rope_fusion": true,
"token_drop_steps": 0,
"recompute_fwd_gate_up": true,
@@ -23,7 +23,6 @@ expert_model_parallel_size: 2
sharding: "stage1"
virtual_pipeline_model_parallel_size: 1
sequence_parallel: 0
-use_flash_attention: true
max_seq_len: 4097
learning_rate: 0.000022
min_lr: 0.00000073333
@@ -48,8 +47,6 @@ distributed_dataloader: 1
unified_checkpoint: true
save_total_limit: 2
skip_profile_timer: false
-fuse_rms_norm: true
-fuse_attention_ffn: true
apply_rope_fusion: true
save_sharded_model: false
load_sharded_model: false
@@ -58,7 +55,7 @@ unified_checkpoint_config: "ignore_merge_optimizer"
offload_optim: true
reorder_pipeline_priority: true
num_nextn_predict_layers: 1
-using_fake_gate: false
+moe_router_force_load_balancing: false
hidden_dropout_prob: 0.1
attention_probs_dropout_prob: 0.1
pre_alloc_memory: 61
@@ -11,7 +11,7 @@ random_shuffle: false

### model
model_name_or_path: baidu/ERNIE-4.5-VL-28B-A3B-Thinking
-attn_impl: flashmask
+_attn_implementation: flashmask

### finetuning
# base
@@ -55,7 +55,6 @@ recompute_num_layers: 1
recompute_modules: ["loss_fn"]
recompute_use_reentrant: true

-use_flash_attention: true
sequence_parallel: true
pp_seg_method: layer:Ernie4_5_DecoderLayer|ErnieDecoderLayer|EmptyLayer
offload_queue: true
@@ -11,7 +11,7 @@ random_shuffle: false

### model
model_name_or_path: baidu/ERNIE-4.5-VL-28B-A3B-Thinking
-attn_impl: flashmask
+_attn_implementation: flashmask

### finetuning
# base
@@ -55,7 +55,6 @@ recompute_num_layers: 1
recompute_modules: ["loss_fn"]
recompute_use_reentrant: true

-use_flash_attention: true
sequence_parallel: true
pp_seg_method: layer:Ernie4_5_DecoderLayer|ErnieDecoderLayer|EmptyLayer
offload_queue: true
@@ -11,7 +11,7 @@ random_shuffle: false

### model
model_name_or_path: baidu/ERNIE-4.5-VL-28B-A3B-Thinking
-attn_impl: flashmask
+_attn_implementation: flashmask
lora: true
lora_rank: 32

@@ -57,7 +57,6 @@ recompute_num_layers: 1
recompute_modules: ["loss_fn"]
recompute_use_reentrant: true

-use_flash_attention: true
sequence_parallel: true
pp_seg_method: layer:Ernie4_5_DecoderLayer|ErnieDecoderLayer|EmptyLayer
offload_queue: true
6 changes: 3 additions & 3 deletions examples/best_practices/PaddleOCR-VL/README.md
@@ -134,7 +134,7 @@ template: paddleocr_vl

### model
model_name_or_path: PaddlePaddle/PaddleOCR-VL
-attn_impl: flashmask
+_attn_implementation: flashmask

### finetuning
# base
@@ -207,7 +207,7 @@ template: paddleocr_vl

### model
model_name_or_path: PaddlePaddle/PaddleOCR-VL
-attn_impl: flashmask
+_attn_implementation: flashmask
lora: true
lora_rank: 8

@@ -728,7 +728,7 @@ CUDA_VISIBLE_DEVICES=0 paddleformers-cli train examples/best_practices/PaddleOCR
per_device_train_batch_size=2 \
per_device_eval_batch_size=2 \
gradient_accumulation_steps=32 \
-attn_impl=sdpa \
+_attn_implementation=sdpa \
pre_alloc_memory=18 \
device=iluvatar_gpu
```
@@ -15,7 +15,7 @@ template: paddleocr_vl

### model
model_name_or_path: PaddlePaddle/PaddleOCR-VL
-attn_impl: flashmask
+_attn_implementation: flashmask

### finetuning
# base
@@ -15,7 +15,7 @@ template: paddleocr_vl

### model
model_name_or_path: PaddlePaddle/PaddleOCR-VL
-attn_impl: flashmask
+_attn_implementation: flashmask
lora: true
lora_rank: 8

@@ -218,7 +218,7 @@ template: qwen3

### model
model_name_or_path: Qwen/Qwen3-0.6B
-attn_impl: flashmask
+_attn_implementation: flashmask

### finetuning
# base
@@ -188,7 +188,7 @@ template: qwen3

### model
model_name_or_path: Qwen/Qwen3-0.6B
-attn_impl: flashmask
+_attn_implementation: flashmask

### finetuning
# base
@@ -444,7 +444,7 @@ template: qwen2_vl

### model
model_name_or_path: Qwen/Qwen2.5-VL-7B-Instruct
-attn_impl: flashmask
+_attn_implementation: flashmask
lora: true
lora_rank: 8
lora_alpha: 32
@@ -267,7 +267,7 @@ mix_strategy: concat

### model
model_name_or_path: Qwen/Qwen3-0.6B
-attn_impl: flashmask
+_attn_implementation: flashmask

### finetuning
# base
@@ -408,7 +408,7 @@ mix_strategy: concat
### model
model_name_or_path: ./checkpoints/paddleformers_qwen3_0p6b_sft_ckpts_emoji/
-attn_impl: flashmask
+_attn_implementation: flashmask
### finetuning
# base
2 changes: 1 addition & 1 deletion examples/config/dpo/full.yaml
@@ -13,7 +13,7 @@ template: qwen3

### model
model_name_or_path: Qwen/Qwen3-0.6B-Base
-attn_impl: flashmask
+_attn_implementation: flashmask

### finetuning
# base
2 changes: 1 addition & 1 deletion examples/config/dpo/full_function_call.yaml
@@ -14,7 +14,7 @@ split_multi_turn: False

### model
model_name_or_path: Qwen/Qwen3-0.6B-Base
-attn_impl: flashmask
+_attn_implementation: flashmask
use_fused_head_and_loss_fn: false
loss_subbatch_sequence_length: 8192

2 changes: 1 addition & 1 deletion examples/config/dpo/full_tp_pp.yaml
@@ -14,7 +14,7 @@ template: qwen3

### model
model_name_or_path: Qwen/Qwen3-0.6B-Base
-attn_impl: flashmask
+_attn_implementation: flashmask

### finetuning
# base
2 changes: 1 addition & 1 deletion examples/config/dpo/full_tp_pp_ep.yaml
@@ -14,7 +14,7 @@ template: qwen3

### model
model_name_or_path: Qwen/Qwen3-0.6B-Base
-attn_impl: flashmask
+_attn_implementation: flashmask

### finetuning
# base
2 changes: 1 addition & 1 deletion examples/config/dpo/lora.yaml
@@ -13,7 +13,7 @@ template: qwen3

### model
model_name_or_path: Qwen/Qwen3-0.6B-Base
-attn_impl: flashmask
+_attn_implementation: flashmask
lora: true
lora_rank: 8

2 changes: 1 addition & 1 deletion examples/config/dpo/lora_tp_pp.yaml
@@ -13,7 +13,7 @@ template: qwen3

### model
model_name_or_path: Qwen/Qwen3-0.6B-Base
-attn_impl: flashmask
+_attn_implementation: flashmask
lora: true
lora_rank: 8

2 changes: 1 addition & 1 deletion examples/config/dpo/lora_tp_pp_ep.yaml
@@ -13,7 +13,7 @@ template: qwen3

### model
model_name_or_path: Qwen/Qwen3-0.6B-Base
-attn_impl: flashmask
+_attn_implementation: flashmask
lora: true
lora_rank: 8

@@ -13,7 +13,7 @@ template: ernie_nothink

### model
model_name_or_path: baidu/ERNIE-4.5-0.3B-PT
-attn_impl: eager
+_attn_implementation: eager

### finetuning
# base
@@ -13,7 +13,7 @@ template: ernie_nothink

### model
model_name_or_path: baidu/ERNIE-4.5-0.3B-PT
-attn_impl: eager
+_attn_implementation: eager
lora: true
lora_rank: 8
