diff --git a/docs/zh/dpo_and_lora_guide.md b/docs/zh/dpo_and_lora_guide.md index a55e6ad3c03..026d2ea3848 100644 --- a/docs/zh/dpo_and_lora_guide.md +++ b/docs/zh/dpo_and_lora_guide.md @@ -77,7 +77,7 @@ mix_strategy: concat ### model model_name_or_path: baidu/ERNIE-4.5-0.3B-PT -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -135,7 +135,7 @@ mix_strategy: concat ### model model_name_or_path: baidu/ERNIE-4.5-0.3B-PT -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 @@ -187,7 +187,7 @@ load_checkpoint_format: flex_checkpoint `model_name_or_path`:模型本地路径或 HuggingFace 仓库对应的名称,如`baidu/ERNIE-4.5-0.3B-PT`,推荐使用 SFT 后的模型 -`attn_impl`:模型 Attention Mask 实现方式,推荐使用 `flashmask`,是一种针对 FlashAttention 的一种核心优化技术。 +`_attn_implementation`:模型 Attention Mask 实现方式,推荐使用 `flashmask`,是一种针对 FlashAttention 的一种核心优化技术。 `lora`:Bool 类型,是否 lora 训练,默认`False`。 diff --git a/docs/zh/pt_and_cpt_guide.md b/docs/zh/pt_and_cpt_guide.md index 113b3bbb8d0..895c1a11a57 100644 --- a/docs/zh/pt_and_cpt_guide.md +++ b/docs/zh/pt_and_cpt_guide.md @@ -59,7 +59,7 @@ mix_strategy: concat ### model model_name_or_path: baidu/ERNIE-4.5-0.3B-Base-PT -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -108,7 +108,7 @@ load_checkpoint_format: flex_checkpoint `model_name_or_path`:模型本地路径或 HuggingFace 仓库对应的名称,如`baidu/ERNIE-4.5-0.3B-Base-PT` -`attn_impl`:模型 Attention Mask 实现方式,推荐使用 `flashmask`,是一种针对 FlashAttention 的一种核心优化技术。 +`_attn_implementation`:模型 Attention Mask 实现方式,推荐使用 `flashmask`,是一种针对 FlashAttention 的一种核心优化技术。 `stage`:与训练类型相关,预训练设置`PT` diff --git a/docs/zh/sft_and_lora_guide.md b/docs/zh/sft_and_lora_guide.md index bba9cb77d86..9f2bd40067d 100644 --- a/docs/zh/sft_and_lora_guide.md +++ b/docs/zh/sft_and_lora_guide.md @@ -67,7 +67,7 @@ mix_strategy: concat ### model model_name_or_path: baidu/ERNIE-4.5-0.3B-Base-PT -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -124,7 +124,7 @@ mix_strategy: concat ### model model_name_or_path: baidu/ERNIE-4.5-0.3B-Base-PT -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 @@ -175,7 +175,7 @@ load_checkpoint_format: flex_checkpoint `model_name_or_path`:模型本地路径或 HuggingFace 仓库对应的名称,如`baidu/ERNIE-4.5-0.3B-Base-PT` -`attn_impl`:模型 Attention Mask 实现方式,推荐使用 `flashmask`,是一种针对 FlashAttention 的一种核心优化技术。 +`_attn_implementation`:模型 Attention Mask 实现方式,推荐使用 `flashmask`,是一种针对 FlashAttention 的一种核心优化技术。 `lora`:Bool 类型,是否 lora 训练,默认`False`。 diff --git a/docs/zh/training_arguments.md b/docs/zh/training_arguments.md index 8adb97b1a17..bdd064981da 100644 --- a/docs/zh/training_arguments.md +++ b/docs/zh/training_arguments.md @@ -283,7 +283,7 @@ --expert_model_parallel_size 专家并行的并行度。(`int`, 可选) - --aux_loss_alpha + --router_aux_loss_coef MoE 模型的辅助损失(Auxiliary loss)权重系数。(`float`, 可选, 默认为 0.0001) --expert_max_capacity diff --git a/examples/best_practices/DeepSeek-V3/SFT-Practice.md b/examples/best_practices/DeepSeek-V3/SFT-Practice.md index 2bbff46f2f9..8304b028e75 100644 --- a/examples/best_practices/DeepSeek-V3/SFT-Practice.md +++ b/examples/best_practices/DeepSeek-V3/SFT-Practice.md @@ -80,4 +80,4 @@ mpirun bash run_dsv3_4k.sh * 在 MoE 模型中,专家间负载不均衡也可能引发 OOM 错误。为此,合理引入 AuxLoss 及其无辅助损失机制至关重要。以下是实验过程中总结的关键注意事项: * Gate 计算隔离:e_score_correction_bias 应仅用于门控权重计算,避免传递至后续 FFN 模块。 * AuxLoss 计算适配:在 SP 或 Subbatch 等并行策略下,需注意 seq_len 的实际取值,确保损失计算正确。 - * 配置调整:Hugging Face 所提供的部分配置(如 aux_loss_alpha)需结合具体训练场景进行针对性调优。 + * 配置调整:Hugging Face 所提供的部分配置(如 router_aux_loss_coef)需结合具体训练场景进行针对性调优。 diff --git 
a/examples/best_practices/DeepSeek-V3/dsv3_128k_config.yaml b/examples/best_practices/DeepSeek-V3/dsv3_128k_config.yaml index ef14fb3aa44..35540d96472 100644 --- a/examples/best_practices/DeepSeek-V3/dsv3_128k_config.yaml +++ b/examples/best_practices/DeepSeek-V3/dsv3_128k_config.yaml @@ -75,10 +75,8 @@ sharding: stage1 bf16: true amp_master_grad: true fp16_opt_level: O2 -use_flash_attention: true use_attn_mask_startend_row_indices: true -using_fake_gate: false +moe_router_force_load_balancing: false pre_alloc_memory: 60 tensorwise_offload_optimizer: true -fuse_rms_norm: true moe_subbatch_token_num_before_dispatch: 1024 \ No newline at end of file diff --git a/examples/best_practices/DeepSeek-V3/dsv3_32k_config.yaml b/examples/best_practices/DeepSeek-V3/dsv3_32k_config.yaml index 0092964981c..8f5d40e0b56 100644 --- a/examples/best_practices/DeepSeek-V3/dsv3_32k_config.yaml +++ b/examples/best_practices/DeepSeek-V3/dsv3_32k_config.yaml @@ -75,10 +75,8 @@ sharding: stage1 bf16: true amp_master_grad: true fp16_opt_level: O2 -use_flash_attention: true use_attn_mask_startend_row_indices: true -using_fake_gate: false +moe_router_force_load_balancing: false pre_alloc_memory: 60 tensorwise_offload_optimizer: true -fuse_rms_norm: true moe_subbatch_token_num_before_dispatch: 0 \ No newline at end of file diff --git a/examples/best_practices/DeepSeek-V3/dsv3_4k_config.yaml b/examples/best_practices/DeepSeek-V3/dsv3_4k_config.yaml index c0f48ac740e..06e9a6d0fb3 100644 --- a/examples/best_practices/DeepSeek-V3/dsv3_4k_config.yaml +++ b/examples/best_practices/DeepSeek-V3/dsv3_4k_config.yaml @@ -75,10 +75,8 @@ sharding: stage1 bf16: true amp_master_grad: true fp16_opt_level: O2 -use_flash_attention: true use_attn_mask_startend_row_indices: true -using_fake_gate: false +moe_router_force_load_balancing: false pre_alloc_memory: 60 tensorwise_offload_optimizer: true -fuse_rms_norm: true moe_subbatch_token_num_before_dispatch: 0 \ No newline at end of file diff --git a/examples/best_practices/DeepSeek-V3/pretrain/config/config.json b/examples/best_practices/DeepSeek-V3/pretrain/config/config.json index 8e64a1615dd..ee0afa87007 100644 --- a/examples/best_practices/DeepSeek-V3/pretrain/config/config.json +++ b/examples/best_practices/DeepSeek-V3/pretrain/config/config.json @@ -9,8 +9,8 @@ "AutoModel": "DeepseekV2ModelFast", "AutoModelForCausalLM": "DeepseekV2ForCausalLM" }, - "aux_loss_alpha": 0.0001, - "aux_loss_free_gamma": 0.0, + "router_aux_loss_coef": 0.0001, + "moe_router_bias_update_rate": 0.0, "bos_token_id": 0, "eos_token_id": 1, "ep_size": 1, @@ -61,8 +61,6 @@ "v_head_dim": 128, "vocab_size": 129280, "using_flex_token": true, - "fuse_rms_norm": true, - "fuse_attention_ffn": true, "apply_rope_fusion": true, "token_drop_steps": 0, "recompute_fwd_gate_up": true, diff --git a/examples/best_practices/DeepSeek-V3/pretrain/config/pretrain_argument.yaml b/examples/best_practices/DeepSeek-V3/pretrain/config/pretrain_argument.yaml index 98980deed53..11d09900ec6 100644 --- a/examples/best_practices/DeepSeek-V3/pretrain/config/pretrain_argument.yaml +++ b/examples/best_practices/DeepSeek-V3/pretrain/config/pretrain_argument.yaml @@ -23,7 +23,6 @@ expert_model_parallel_size: 2 sharding: "stage1" virtual_pipeline_model_parallel_size: 1 sequence_parallel: 0 -use_flash_attention: true max_seq_len: 4097 learning_rate: 0.000022 min_lr: 0.00000073333 @@ -48,8 +47,6 @@ distributed_dataloader: 1 unified_checkpoint: true save_total_limit: 2 skip_profile_timer: false -fuse_rms_norm: true -fuse_attention_ffn: true 
apply_rope_fusion: true save_sharded_model: false load_sharded_model: false @@ -58,7 +55,7 @@ unified_checkpoint_config: "ignore_merge_optimizer" offload_optim: true reorder_pipeline_priority: true num_nextn_predict_layers: 1 -using_fake_gate: false +moe_router_force_load_balancing: false hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 pre_alloc_memory: 61 \ No newline at end of file diff --git a/examples/best_practices/ERNIE-4.5-VL/ernie45vl_32k_config.yaml b/examples/best_practices/ERNIE-4.5-VL/ernie45vl_32k_config.yaml index 56a4bd83f9d..7496d7741cf 100644 --- a/examples/best_practices/ERNIE-4.5-VL/ernie45vl_32k_config.yaml +++ b/examples/best_practices/ERNIE-4.5-VL/ernie45vl_32k_config.yaml @@ -11,7 +11,7 @@ random_shuffle: false ### model model_name_or_path: baidu/ERNIE-4.5-VL-28B-A3B-Thinking -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -55,7 +55,6 @@ recompute_num_layers: 1 recompute_modules: ["loss_fn"] recompute_use_reentrant: true -use_flash_attention: true sequence_parallel: true pp_seg_method: layer:Ernie4_5_DecoderLayer|ErnieDecoderLayer|EmptyLayer offload_queue: true diff --git a/examples/best_practices/ERNIE-4.5-VL/ernie45vl_8k_config.yaml b/examples/best_practices/ERNIE-4.5-VL/ernie45vl_8k_config.yaml index 3b1e6a95b58..e519f71bc30 100644 --- a/examples/best_practices/ERNIE-4.5-VL/ernie45vl_8k_config.yaml +++ b/examples/best_practices/ERNIE-4.5-VL/ernie45vl_8k_config.yaml @@ -11,7 +11,7 @@ random_shuffle: false ### model model_name_or_path: baidu/ERNIE-4.5-VL-28B-A3B-Thinking -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -55,7 +55,6 @@ recompute_num_layers: 1 recompute_modules: ["loss_fn"] recompute_use_reentrant: true -use_flash_attention: true sequence_parallel: true pp_seg_method: layer:Ernie4_5_DecoderLayer|ErnieDecoderLayer|EmptyLayer offload_queue: true diff --git a/examples/best_practices/ERNIE-4.5-VL/ernie45vl_8k_lora_config.yaml b/examples/best_practices/ERNIE-4.5-VL/ernie45vl_8k_lora_config.yaml index b49aae15c08..64b82d9435c 100644 --- a/examples/best_practices/ERNIE-4.5-VL/ernie45vl_8k_lora_config.yaml +++ b/examples/best_practices/ERNIE-4.5-VL/ernie45vl_8k_lora_config.yaml @@ -11,7 +11,7 @@ random_shuffle: false ### model model_name_or_path: baidu/ERNIE-4.5-VL-28B-A3B-Thinking -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 32 @@ -57,7 +57,6 @@ recompute_num_layers: 1 recompute_modules: ["loss_fn"] recompute_use_reentrant: true -use_flash_attention: true sequence_parallel: true pp_seg_method: layer:Ernie4_5_DecoderLayer|ErnieDecoderLayer|EmptyLayer offload_queue: true diff --git a/examples/best_practices/PaddleOCR-VL/README.md b/examples/best_practices/PaddleOCR-VL/README.md index f2e3a3d53b6..980f7665e63 100644 --- a/examples/best_practices/PaddleOCR-VL/README.md +++ b/examples/best_practices/PaddleOCR-VL/README.md @@ -134,7 +134,7 @@ template: paddleocr_vl ### model model_name_or_path: PaddlePaddle/PaddleOCR-VL -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -207,7 +207,7 @@ template: paddleocr_vl ### model model_name_or_path: PaddlePaddle/PaddleOCR-VL -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 @@ -728,7 +728,7 @@ CUDA_VISIBLE_DEVICES=0 paddleformers-cli train examples/best_practices/PaddleOCR per_device_train_batch_size=2 \ per_device_eval_batch_size=2 \ gradient_accumulation_steps=32 \ - attn_impl=sdpa \ + _attn_implementation=sdpa \ pre_alloc_memory=18 \ device=iluvatar_gpu ``` diff 
--git a/examples/best_practices/PaddleOCR-VL/paddleocr-vl_full_16k_config.yaml b/examples/best_practices/PaddleOCR-VL/paddleocr-vl_full_16k_config.yaml index 2bb001d40d2..6d1e3debe44 100644 --- a/examples/best_practices/PaddleOCR-VL/paddleocr-vl_full_16k_config.yaml +++ b/examples/best_practices/PaddleOCR-VL/paddleocr-vl_full_16k_config.yaml @@ -15,7 +15,7 @@ template: paddleocr_vl ### model model_name_or_path: PaddlePaddle/PaddleOCR-VL -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/best_practices/PaddleOCR-VL/paddleocr-vl_lora_16k_config.yaml b/examples/best_practices/PaddleOCR-VL/paddleocr-vl_lora_16k_config.yaml index 6f4cbf00a0c..18ec25325ef 100644 --- a/examples/best_practices/PaddleOCR-VL/paddleocr-vl_lora_16k_config.yaml +++ b/examples/best_practices/PaddleOCR-VL/paddleocr-vl_lora_16k_config.yaml @@ -15,7 +15,7 @@ template: paddleocr_vl ### model model_name_or_path: PaddlePaddle/PaddleOCR-VL -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/best_practices/tutorials/how_to_train_a_function_call_model.md b/examples/best_practices/tutorials/how_to_train_a_function_call_model.md index 1301a774a3e..a3f09e83833 100644 --- a/examples/best_practices/tutorials/how_to_train_a_function_call_model.md +++ b/examples/best_practices/tutorials/how_to_train_a_function_call_model.md @@ -218,7 +218,7 @@ template: qwen3 ### model model_name_or_path: Qwen/Qwen3-0.6B -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/best_practices/tutorials/how_to_train_a_reasoning_model.md b/examples/best_practices/tutorials/how_to_train_a_reasoning_model.md index 49553a08a77..e2674eb538a 100644 --- a/examples/best_practices/tutorials/how_to_train_a_reasoning_model.md +++ b/examples/best_practices/tutorials/how_to_train_a_reasoning_model.md @@ -188,7 +188,7 @@ template: qwen3 ### model model_name_or_path: Qwen/Qwen3-0.6B -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/best_practices/tutorials/how_to_train_a_visual_grounding_model.md b/examples/best_practices/tutorials/how_to_train_a_visual_grounding_model.md index f237076bc9e..9f2d4bceb26 100644 --- a/examples/best_practices/tutorials/how_to_train_a_visual_grounding_model.md +++ b/examples/best_practices/tutorials/how_to_train_a_visual_grounding_model.md @@ -444,7 +444,7 @@ template: qwen2_vl ### model model_name_or_path: Qwen/Qwen2.5-VL-7B-Instruct -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 lora_alpha: 32 diff --git a/examples/best_practices/tutorials/how_to_train_an_emoji_model.md b/examples/best_practices/tutorials/how_to_train_an_emoji_model.md index 6fda3e9403d..30dc9c711c6 100644 --- a/examples/best_practices/tutorials/how_to_train_an_emoji_model.md +++ b/examples/best_practices/tutorials/how_to_train_an_emoji_model.md @@ -267,7 +267,7 @@ mix_strategy: concat ### model model_name_or_path: Qwen/Qwen3-0.6B -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -408,7 +408,7 @@ mix_strategy: concat ### model model_name_or_path: ./checkpoints/paddleformers_qwen3_0p6b_sft_ckpts_emoji/ -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/dpo/full.yaml b/examples/config/dpo/full.yaml index d602ebaa13f..333383b762d 100644 --- a/examples/config/dpo/full.yaml +++ b/examples/config/dpo/full.yaml @@ -13,7 +13,7 @@ template: qwen3 ### model model_name_or_path: 
Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/dpo/full_function_call.yaml b/examples/config/dpo/full_function_call.yaml index 6484f9a9edb..fd5e2c01fe1 100644 --- a/examples/config/dpo/full_function_call.yaml +++ b/examples/config/dpo/full_function_call.yaml @@ -14,7 +14,7 @@ split_multi_turn: False ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask use_fused_head_and_loss_fn: false loss_subbatch_sequence_length: 8192 diff --git a/examples/config/dpo/full_tp_pp.yaml b/examples/config/dpo/full_tp_pp.yaml index b17fe3c0f10..78a930e9946 100644 --- a/examples/config/dpo/full_tp_pp.yaml +++ b/examples/config/dpo/full_tp_pp.yaml @@ -14,7 +14,7 @@ template: qwen3 ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/dpo/full_tp_pp_ep.yaml b/examples/config/dpo/full_tp_pp_ep.yaml index fe42e75d935..d8c23af56f3 100644 --- a/examples/config/dpo/full_tp_pp_ep.yaml +++ b/examples/config/dpo/full_tp_pp_ep.yaml @@ -14,7 +14,7 @@ template: qwen3 ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/dpo/lora.yaml b/examples/config/dpo/lora.yaml index 1e9554a09d5..958d570e323 100644 --- a/examples/config/dpo/lora.yaml +++ b/examples/config/dpo/lora.yaml @@ -13,7 +13,7 @@ template: qwen3 ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/config/dpo/lora_tp_pp.yaml b/examples/config/dpo/lora_tp_pp.yaml index fe4bc3a5feb..6310342b0dc 100644 --- a/examples/config/dpo/lora_tp_pp.yaml +++ b/examples/config/dpo/lora_tp_pp.yaml @@ -13,7 +13,7 @@ template: qwen3 ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/config/dpo/lora_tp_pp_ep.yaml b/examples/config/dpo/lora_tp_pp_ep.yaml index ee1792e4c25..e3129f24ea9 100644 --- a/examples/config/dpo/lora_tp_pp_ep.yaml +++ b/examples/config/dpo/lora_tp_pp_ep.yaml @@ -13,7 +13,7 @@ template: qwen3 ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/config/iluvatar/ERNIE-4.5-0.3B-PT/sft/full_8k.yaml b/examples/config/iluvatar/ERNIE-4.5-0.3B-PT/sft/full_8k.yaml index c8c353289de..4992234d014 100644 --- a/examples/config/iluvatar/ERNIE-4.5-0.3B-PT/sft/full_8k.yaml +++ b/examples/config/iluvatar/ERNIE-4.5-0.3B-PT/sft/full_8k.yaml @@ -13,7 +13,7 @@ template: ernie_nothink ### model model_name_or_path: baidu/ERNIE-4.5-0.3B-PT -attn_impl: eager +_attn_implementation: eager ### finetuning # base diff --git a/examples/config/iluvatar/ERNIE-4.5-0.3B-PT/sft/lora_8k.yaml b/examples/config/iluvatar/ERNIE-4.5-0.3B-PT/sft/lora_8k.yaml index 6c12c1aa070..c4b2e1935a8 100644 --- a/examples/config/iluvatar/ERNIE-4.5-0.3B-PT/sft/lora_8k.yaml +++ b/examples/config/iluvatar/ERNIE-4.5-0.3B-PT/sft/lora_8k.yaml @@ -13,7 +13,7 @@ template: ernie_nothink ### model model_name_or_path: baidu/ERNIE-4.5-0.3B-PT -attn_impl: eager +_attn_implementation: eager lora: true lora_rank: 8 diff --git a/examples/config/iluvatar/ERNIE-4.5-21B-A3B-PT/sft/full_8k.yaml b/examples/config/iluvatar/ERNIE-4.5-21B-A3B-PT/sft/full_8k.yaml index f5d5a012ce3..a1c3ee5a94a 100644 --- 
a/examples/config/iluvatar/ERNIE-4.5-21B-A3B-PT/sft/full_8k.yaml +++ b/examples/config/iluvatar/ERNIE-4.5-21B-A3B-PT/sft/full_8k.yaml @@ -13,7 +13,7 @@ template: ernie_nothink ### model model_name_or_path: baidu/ERNIE-4.5-21B-A3B-PT -attn_impl: eager +_attn_implementation: eager ### finetuning # base diff --git a/examples/config/iluvatar/ERNIE-4.5-21B-A3B-PT/sft/lora_8k.yaml b/examples/config/iluvatar/ERNIE-4.5-21B-A3B-PT/sft/lora_8k.yaml index 13a48ad5109..f26f0f3161b 100644 --- a/examples/config/iluvatar/ERNIE-4.5-21B-A3B-PT/sft/lora_8k.yaml +++ b/examples/config/iluvatar/ERNIE-4.5-21B-A3B-PT/sft/lora_8k.yaml @@ -13,7 +13,7 @@ template: ernie_nothink ### model model_name_or_path: baidu/ERNIE-4.5-21B-A3B-PT -attn_impl: eager +_attn_implementation: eager lora: true lora_rank: 8 diff --git a/examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_full_16k_config.yaml b/examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_full_16k_config.yaml index 4fa8d6e0dfb..f2cb052e5ca 100644 --- a/examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_full_16k_config.yaml +++ b/examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_full_16k_config.yaml @@ -15,7 +15,7 @@ template: paddleocr_vl ### model model_name_or_path: PaddlePaddle/PaddleOCR-VL -attn_impl: sdpa +_attn_implementation: sdpa ### finetuning # base diff --git a/examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_lora_16k_config.yaml b/examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_lora_16k_config.yaml index de7d9417d37..c83f56e2cc2 100644 --- a/examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_lora_16k_config.yaml +++ b/examples/config/iluvatar/PaddleOCR-VL/sft/paddleocr-vl_lora_16k_config.yaml @@ -15,7 +15,7 @@ template: paddleocr_vl ### model model_name_or_path: PaddlePaddle/PaddleOCR-VL -attn_impl: sdpa +_attn_implementation: sdpa lora: true lora_rank: 8 diff --git a/examples/config/metax/ERNIE-4.5-0.3B/sft/lora.yaml b/examples/config/metax/ERNIE-4.5-0.3B/sft/lora.yaml index 77ee09f5f19..8b6747eee9c 100644 --- a/examples/config/metax/ERNIE-4.5-0.3B/sft/lora.yaml +++ b/examples/config/metax/ERNIE-4.5-0.3B/sft/lora.yaml @@ -13,7 +13,7 @@ template: ernie_nothink ### model model_name_or_path: baidu/ERNIE-4.5-0.3B-PT -attn_impl: eager +_attn_implementation: eager lora: true lora_rank: 8 diff --git a/examples/config/metax/ERNIE-4.5-0.3B/sft/sft.yaml b/examples/config/metax/ERNIE-4.5-0.3B/sft/sft.yaml index debf40fc597..d9c6d31fdf6 100644 --- a/examples/config/metax/ERNIE-4.5-0.3B/sft/sft.yaml +++ b/examples/config/metax/ERNIE-4.5-0.3B/sft/sft.yaml @@ -13,7 +13,7 @@ template: ernie_nothink ### model model_name_or_path: baidu/ERNIE-4.5-0.3B-PT -attn_impl: eager +_attn_implementation: eager ### finetuning # base diff --git a/examples/config/metax/ERNIE-4.5-21B-A3B/sft/lora.yaml b/examples/config/metax/ERNIE-4.5-21B-A3B/sft/lora.yaml index ebaf2c5944c..045899aac7d 100644 --- a/examples/config/metax/ERNIE-4.5-21B-A3B/sft/lora.yaml +++ b/examples/config/metax/ERNIE-4.5-21B-A3B/sft/lora.yaml @@ -13,7 +13,7 @@ template: ernie_nothink ### model model_name_or_path: baidu/ERNIE-4.5-21B-A3B-PT -attn_impl: eager +_attn_implementation: eager lora: true lora_rank: 8 diff --git a/examples/config/metax/ERNIE-4.5-21B-A3B/sft/sft.yaml b/examples/config/metax/ERNIE-4.5-21B-A3B/sft/sft.yaml index ac28e16e106..91ed81b80b6 100644 --- a/examples/config/metax/ERNIE-4.5-21B-A3B/sft/sft.yaml +++ b/examples/config/metax/ERNIE-4.5-21B-A3B/sft/sft.yaml @@ -13,7 +13,7 @@ template: ernie_nothink ### model model_name_or_path: baidu/ERNIE-4.5-21B-A3B-PT 
-attn_impl: eager +_attn_implementation: eager ### finetuning # base diff --git a/examples/config/pt/full.yaml b/examples/config/pt/full.yaml index 74c59f5e8fa..a2e0d46b70e 100644 --- a/examples/config/pt/full.yaml +++ b/examples/config/pt/full.yaml @@ -11,7 +11,7 @@ mix_strategy: concat ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/pt/full_offline_data.yaml b/examples/config/pt/full_offline_data.yaml index d2adcb32a96..2ea68f339f8 100644 --- a/examples/config/pt/full_offline_data.yaml +++ b/examples/config/pt/full_offline_data.yaml @@ -7,7 +7,7 @@ mix_strategy: concat ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/pt/full_tp_pp.yaml b/examples/config/pt/full_tp_pp.yaml index a54f2942716..f4fc4e0f9a9 100644 --- a/examples/config/pt/full_tp_pp.yaml +++ b/examples/config/pt/full_tp_pp.yaml @@ -11,7 +11,7 @@ mix_strategy: concat ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/pt/full_tp_pp_ep.yaml b/examples/config/pt/full_tp_pp_ep.yaml index f724b18e2de..1f6c4e6edad 100644 --- a/examples/config/pt/full_tp_pp_ep.yaml +++ b/examples/config/pt/full_tp_pp_ep.yaml @@ -12,7 +12,7 @@ mix_strategy: concat ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/pt/lora.yaml b/examples/config/pt/lora.yaml index 1ac3323e5ea..f88bc71612d 100644 --- a/examples/config/pt/lora.yaml +++ b/examples/config/pt/lora.yaml @@ -11,7 +11,7 @@ mix_strategy: concat ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/config/pt/lora_tp_pp.yaml b/examples/config/pt/lora_tp_pp.yaml index 224d6220128..aec0078170f 100644 --- a/examples/config/pt/lora_tp_pp.yaml +++ b/examples/config/pt/lora_tp_pp.yaml @@ -11,7 +11,7 @@ mix_strategy: concat ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/config/pt/lora_tp_pp_ep.yaml b/examples/config/pt/lora_tp_pp_ep.yaml index 40afabeaaa1..d2800c73d9d 100644 --- a/examples/config/pt/lora_tp_pp_ep.yaml +++ b/examples/config/pt/lora_tp_pp_ep.yaml @@ -12,7 +12,7 @@ mix_strategy: concat ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/sft-vl/full.yaml b/examples/config/sft-vl/full.yaml index 73d98507e37..9667f143561 100644 --- a/examples/config/sft-vl/full.yaml +++ b/examples/config/sft-vl/full.yaml @@ -13,7 +13,7 @@ template: qwen2_vl ### model model_name_or_path: Qwen/Qwen2.5-VL-3B-Instruct -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/sft-vl/full_fsdp.yaml b/examples/config/sft-vl/full_fsdp.yaml index 35e2525d495..323b6fd645e 100644 --- a/examples/config/sft-vl/full_fsdp.yaml +++ b/examples/config/sft-vl/full_fsdp.yaml @@ -13,7 +13,7 @@ template: qwen2_vl ### model model_name_or_path: Qwen/Qwen2.5-VL-3B-Instruct -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/sft-vl/full_tp.yaml b/examples/config/sft-vl/full_tp.yaml index e7faba53ee7..c2364495a41 
100644 --- a/examples/config/sft-vl/full_tp.yaml +++ b/examples/config/sft-vl/full_tp.yaml @@ -13,7 +13,7 @@ template: qwen2_vl ### model model_name_or_path: Qwen/Qwen2.5-VL-3B-Instruct -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/sft-vl/lora.yaml b/examples/config/sft-vl/lora.yaml index 89b8db42029..f7f80245f23 100644 --- a/examples/config/sft-vl/lora.yaml +++ b/examples/config/sft-vl/lora.yaml @@ -13,7 +13,7 @@ template: qwen2_vl ### model model_name_or_path: Qwen/Qwen2.5-VL-3B-Instruct -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/config/sft-vl/lora_fsdp.yaml b/examples/config/sft-vl/lora_fsdp.yaml index e9704e1ac58..694b2009f2d 100644 --- a/examples/config/sft-vl/lora_fsdp.yaml +++ b/examples/config/sft-vl/lora_fsdp.yaml @@ -13,7 +13,7 @@ template: qwen2_vl ### model model_name_or_path: Qwen/Qwen2.5-VL-3B-Instruct -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/config/sft-vl/lora_tp.yaml b/examples/config/sft-vl/lora_tp.yaml index a42c8bc60d5..b674d751692 100644 --- a/examples/config/sft-vl/lora_tp.yaml +++ b/examples/config/sft-vl/lora_tp.yaml @@ -13,7 +13,7 @@ template: qwen2_vl ### model model_name_or_path: Qwen/Qwen2.5-VL-3B-Instruct -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/config/sft/full.yaml b/examples/config/sft/full.yaml index be52732f89f..7c5907060d7 100644 --- a/examples/config/sft/full.yaml +++ b/examples/config/sft/full.yaml @@ -13,7 +13,7 @@ template: qwen3 ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/sft/full_function_call.yaml b/examples/config/sft/full_function_call.yaml index e2edda92d93..ce7d2f58c91 100644 --- a/examples/config/sft/full_function_call.yaml +++ b/examples/config/sft/full_function_call.yaml @@ -14,7 +14,7 @@ split_multi_turn: False ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask use_fused_head_and_loss_fn: False loss_subbatch_sequence_length: 8192 diff --git a/examples/config/sft/full_tp_pp.yaml b/examples/config/sft/full_tp_pp.yaml index af75e061e1f..dfba5e4a420 100644 --- a/examples/config/sft/full_tp_pp.yaml +++ b/examples/config/sft/full_tp_pp.yaml @@ -13,7 +13,7 @@ template: qwen3 ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/sft/full_tp_pp_ep.yaml b/examples/config/sft/full_tp_pp_ep.yaml index fb754168c5f..2250a2cb3e0 100644 --- a/examples/config/sft/full_tp_pp_ep.yaml +++ b/examples/config/sft/full_tp_pp_ep.yaml @@ -14,7 +14,7 @@ template: qwen3 ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/sft/lora.yaml b/examples/config/sft/lora.yaml index 41b24a597da..9601cea349e 100644 --- a/examples/config/sft/lora.yaml +++ b/examples/config/sft/lora.yaml @@ -13,7 +13,7 @@ template: qwen3 ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/config/sft/lora_tp_pp.yaml b/examples/config/sft/lora_tp_pp.yaml index b88d0d82303..8495b7935d2 100644 --- a/examples/config/sft/lora_tp_pp.yaml +++ b/examples/config/sft/lora_tp_pp.yaml @@ -13,7 +13,7 @@ template: 
qwen3 ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/config/sft/lora_tp_pp_ep.yaml b/examples/config/sft/lora_tp_pp_ep.yaml index 1c3ab1a87b9..cea763c69fe 100644 --- a/examples/config/sft/lora_tp_pp_ep.yaml +++ b/examples/config/sft/lora_tp_pp_ep.yaml @@ -13,7 +13,7 @@ template: qwen3 ### model model_name_or_path: Qwen/Qwen3-0.6B-Base -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/config/xpu/DeepseekV3/sft/full_32k_config.yaml b/examples/config/xpu/DeepseekV3/sft/full_32k_config.yaml index 71eba3c7955..301277e0fd9 100644 --- a/examples/config/xpu/DeepseekV3/sft/full_32k_config.yaml +++ b/examples/config/xpu/DeepseekV3/sft/full_32k_config.yaml @@ -75,11 +75,9 @@ sharding: stage1 bf16: true amp_master_grad: true fp16_opt_level: O2 -use_flash_attention: true use_attn_mask_startend_row_indices: true -using_fake_gate: false +moe_router_force_load_balancing: false pre_alloc_memory: 60 tensorwise_offload_optimizer: true -fuse_rms_norm: true moe_subbatch_token_num_before_dispatch: 0 device: xpu \ No newline at end of file diff --git a/examples/config/xpu/DeepseekV3/sft/full_4k_config.yaml b/examples/config/xpu/DeepseekV3/sft/full_4k_config.yaml index 2dc1856195d..ca16bbda4d9 100644 --- a/examples/config/xpu/DeepseekV3/sft/full_4k_config.yaml +++ b/examples/config/xpu/DeepseekV3/sft/full_4k_config.yaml @@ -75,11 +75,9 @@ sharding: stage1 bf16: true amp_master_grad: true fp16_opt_level: O2 -use_flash_attention: true use_attn_mask_startend_row_indices: true -using_fake_gate: false +moe_router_force_load_balancing: false pre_alloc_memory: 60 tensorwise_offload_optimizer: true -fuse_rms_norm: true moe_subbatch_token_num_before_dispatch: 0 device: xpu \ No newline at end of file diff --git a/examples/config/xpu/ERNIE-4.5-0.3B/sft/full_8k.yaml b/examples/config/xpu/ERNIE-4.5-0.3B/sft/full_8k.yaml index 0a293e883a3..0ac32f661cc 100644 --- a/examples/config/xpu/ERNIE-4.5-0.3B/sft/full_8k.yaml +++ b/examples/config/xpu/ERNIE-4.5-0.3B/sft/full_8k.yaml @@ -13,7 +13,7 @@ template: ernie_nothink ### model model_name_or_path: baidu/ERNIE-4.5-0.3B-PT -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/xpu/ERNIE-4.5-0.3B/sft/lora_8k.yaml b/examples/config/xpu/ERNIE-4.5-0.3B/sft/lora_8k.yaml index 9cbf220164c..f1c120823cd 100644 --- a/examples/config/xpu/ERNIE-4.5-0.3B/sft/lora_8k.yaml +++ b/examples/config/xpu/ERNIE-4.5-0.3B/sft/lora_8k.yaml @@ -13,7 +13,7 @@ template: ernie_nothink ### model model_name_or_path: baidu/ERNIE-4.5-0.3B-PT -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/config/xpu/ERNIE-4.5-21B-A3B/sft/full_32k.yaml b/examples/config/xpu/ERNIE-4.5-21B-A3B/sft/full_32k.yaml index 26211d2f88a..273caaf378b 100644 --- a/examples/config/xpu/ERNIE-4.5-21B-A3B/sft/full_32k.yaml +++ b/examples/config/xpu/ERNIE-4.5-21B-A3B/sft/full_32k.yaml @@ -13,7 +13,7 @@ template: ernie_nothink ### model model_name_or_path: baidu/ERNIE-4.5-21B-A3B-PT -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/xpu/ERNIE-4.5-21B-A3B/sft/lora_32k.yaml b/examples/config/xpu/ERNIE-4.5-21B-A3B/sft/lora_32k.yaml index 2f29a7ee833..b1cbb0dc7fa 100644 --- a/examples/config/xpu/ERNIE-4.5-21B-A3B/sft/lora_32k.yaml +++ b/examples/config/xpu/ERNIE-4.5-21B-A3B/sft/lora_32k.yaml @@ -13,7 +13,7 @@ template: ernie_nothink ### model 
model_name_or_path: baidu/ERNIE-4.5-21B-A3B-PT -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_full_16k_config.yaml b/examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_full_16k_config.yaml index 9aa887f5b50..6aef90622e2 100644 --- a/examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_full_16k_config.yaml +++ b/examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_full_16k_config.yaml @@ -15,7 +15,7 @@ template: paddleocr_vl ### model model_name_or_path: PaddlePaddle/PaddleOCR-VL -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_lora_16k_config.yaml b/examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_lora_16k_config.yaml index 093722c055f..0bba74dcb82 100644 --- a/examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_lora_16k_config.yaml +++ b/examples/config/xpu/PaddleOCR-VL/sft/paddleocr-vl_lora_16k_config.yaml @@ -15,7 +15,7 @@ template: paddleocr_vl ### model model_name_or_path: PaddlePaddle/PaddleOCR-VL -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 diff --git a/examples/experiments/deepseek_v3_pretrain/config/config.json b/examples/experiments/deepseek_v3_pretrain/config/config.json index 8e64a1615dd..ee0afa87007 100644 --- a/examples/experiments/deepseek_v3_pretrain/config/config.json +++ b/examples/experiments/deepseek_v3_pretrain/config/config.json @@ -9,8 +9,8 @@ "AutoModel": "DeepseekV2ModelFast", "AutoModelForCausalLM": "DeepseekV2ForCausalLM" }, - "aux_loss_alpha": 0.0001, - "aux_loss_free_gamma": 0.0, + "router_aux_loss_coef": 0.0001, + "moe_router_bias_update_rate": 0.0, "bos_token_id": 0, "eos_token_id": 1, "ep_size": 1, @@ -61,8 +61,6 @@ "v_head_dim": 128, "vocab_size": 129280, "using_flex_token": true, - "fuse_rms_norm": true, - "fuse_attention_ffn": true, "apply_rope_fusion": true, "token_drop_steps": 0, "recompute_fwd_gate_up": true, diff --git a/examples/experiments/deepseek_v3_pretrain/config/configuration.py b/examples/experiments/deepseek_v3_pretrain/config/configuration.py index 4c475d427ba..53fceddc57b 100644 --- a/examples/experiments/deepseek_v3_pretrain/config/configuration.py +++ b/examples/experiments/deepseek_v3_pretrain/config/configuration.py @@ -69,7 +69,7 @@ class DeepseekV2FastConfig(PretrainedConfig): Whether to normalize the weights of the routed experts. scoring_func (`str`, *optional*, defaults to 'softmax'): Method of computing expert weights. - aux_loss_alpha (`float`, *optional*, defaults to 0.001): + router_aux_loss_coef (`float`, *optional*, defaults to 0.001): Auxiliary loss weight coefficient. seq_aux = (`bool`, *optional*, defaults to True): Whether to compute the auxiliary loss for each individual sample. 
@@ -159,7 +159,7 @@ def __init__( first_k_dense_replace=0, norm_topk_prob=False, scoring_func="softmax", - aux_loss_alpha=0.001, + router_aux_loss_coef=0.001, seq_aux=True, hidden_act="silu", max_position_embeddings=2048, @@ -234,7 +234,7 @@ def __init__( self.first_k_dense_replace = first_k_dense_replace self.norm_topk_prob = norm_topk_prob self.scoring_func = scoring_func - self.aux_loss_alpha = aux_loss_alpha + self.router_aux_loss_coef = router_aux_loss_coef self.seq_aux = seq_aux # for backward compatibility if num_key_value_heads is None: diff --git a/examples/experiments/deepseek_v3_pretrain/config/pretrain_argument.json b/examples/experiments/deepseek_v3_pretrain/config/pretrain_argument.json index fd12017e217..31f6e91e008 100644 --- a/examples/experiments/deepseek_v3_pretrain/config/pretrain_argument.json +++ b/examples/experiments/deepseek_v3_pretrain/config/pretrain_argument.json @@ -17,7 +17,6 @@ "sharding": "stage1", "virtual_pipeline_model_parallel_size": 1, "sequence_parallel": 0, - "use_flash_attention": true, "max_seq_length": 4097, "learning_rate": 2.2e-05, "min_learning_rate": 7.333e-07, @@ -43,8 +42,6 @@ "unified_checkpoint": true, "save_total_limit": 2, "skip_profile_timer": false, - "fuse_rms_norm": true, - "fuse_attention_ffn": true, "apply_rope_fusion": true, "save_sharded_model": false, "load_sharded_model": false, @@ -53,6 +50,6 @@ "offload_optim": true, "reorder_pipeline_priority": true, "num_nextn_predict_layers":1, - "using_fake_gate": false, + "moe_router_force_load_balancing": false, "fa_version": 3 } \ No newline at end of file diff --git a/examples/experiments/deepseek_v3_pretrain/config/pretrain_argument.yaml b/examples/experiments/deepseek_v3_pretrain/config/pretrain_argument.yaml index d4893c8e1ae..fcdbd662f44 100644 --- a/examples/experiments/deepseek_v3_pretrain/config/pretrain_argument.yaml +++ b/examples/experiments/deepseek_v3_pretrain/config/pretrain_argument.yaml @@ -23,7 +23,6 @@ expert_model_parallel_size: 32 sharding: "stage1" virtual_pipeline_model_parallel_size: 1 sequence_parallel: 0 -use_flash_attention: true max_seq_len: 4097 learning_rate: 0.000022 min_lr: 0.00000073333 @@ -48,8 +47,6 @@ distributed_dataloader: 1 unified_checkpoint: true save_total_limit: 2 skip_profile_timer: false -fuse_rms_norm: true -fuse_attention_ffn: true apply_rope_fusion: true save_sharded_model: false load_sharded_model: false @@ -58,7 +55,7 @@ unified_checkpoint_config: "ignore_merge_optimizer" offload_optim: true reorder_pipeline_priority: true num_nextn_predict_layers: 1 -using_fake_gate: false +moe_router_force_load_balancing: false hidden_dropout_prob: 0.1 attention_probs_dropout_prob: 0.1 pre_alloc_memory: 61 \ No newline at end of file diff --git a/examples/experiments/deepseek_v3_pretrain/modeling.py b/examples/experiments/deepseek_v3_pretrain/modeling.py index 26ab8c34a0d..7bf7e250261 100644 --- a/examples/experiments/deepseek_v3_pretrain/modeling.py +++ b/examples/experiments/deepseek_v3_pretrain/modeling.py @@ -258,7 +258,6 @@ def __init__(self, config: DeepseekV2FastConfig, hidden_size=None, intermediate_ self.config = config self.hidden_size = config.hidden_size if hidden_size is None else hidden_size self.intermediate_size = config.intermediate_size if intermediate_size is None else intermediate_size - self.fuse_attention_ffn = config.fuse_attention_ffn Linear = FP8Linear if self.config.dsv3_use_fp8_gemm else Linear_ def linear_dtype_gaurd(): @@ -295,20 +294,13 @@ def linear_dtype_gaurd(): has_bias=False, ) else: - if config.fuse_attention_ffn: - 
self.gate_up_fused_proj = Linear(self.hidden_size, self.intermediate_size * 2, bias_attr=False) - else: - self.gate_proj = Linear(self.hidden_size, self.intermediate_size, bias_attr=False) - self.up_proj = Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + self.gate_up_fused_proj = Linear(self.hidden_size, self.intermediate_size * 2, bias_attr=False) self.down_proj = Linear(self.intermediate_size, self.hidden_size, bias_attr=False) self.act_fn = ACT2FN[config.hidden_act] def forward(self, x): - if self.fuse_attention_ffn: - x = swiglu(self.gate_up_fused_proj(x)) - else: - x = swiglu(self.gate_proj(x), self.up_proj(x)) + x = swiglu(self.gate_up_fused_proj(x)) out = self.down_proj(x) return out @@ -370,7 +362,7 @@ def forward(self, hidden_states): # compute gating score if self.using_post_norm_recompute: logits, norm_out = FusedNormGateFunc.apply(hidden_states, self.norm_weight, self.weight, self.norm_eps) - if hasattr(self.config, "using_fake_gate") and self.config.using_fake_gate: + if hasattr(self.config, "moe_router_force_load_balancing") and self.config.moe_router_force_load_balancing: logits = FakeGate.apply( hidden_states, self.weight, @@ -380,7 +372,10 @@ def forward(self, hidden_states): else: with paddle.amp.auto_cast(False): hidden_states = hidden_states.cast(self.weight.dtype) - if hasattr(self.config, "using_fake_gate") and self.config.using_fake_gate: + if ( + hasattr(self.config, "moe_router_force_load_balancing") + and self.config.moe_router_force_load_balancing + ): logits = FakeGate.apply( hidden_states, self.weight, @@ -473,7 +468,7 @@ def __init__(self, config: DeepseekV2FastConfig, norm_weight=None, norm_eps=None p.expert = not self.is_mp_moe logger.info(f"expert no-sync={p.no_sync}-{p.name}") - self.alpha = config.aux_loss_alpha + self.alpha = config.router_aux_loss_coef if config.n_shared_experts is not None: intermediate_size = config.moe_intermediate_size * config.n_shared_experts if self.using_post_norm_recompute: @@ -1658,8 +1653,7 @@ def forward( attention_mask = self._prepare_decoder_attention_mask( attention_mask, (batch_size, seq_length), past_key_values_length, inputs_embeds.dtype ) # [bs, 1, seq_len, seq_len] - if self.config.use_flash_attention: - attention_mask = None if is_casual_mask(attention_mask) else attention_mask + attention_mask = None if is_casual_mask(attention_mask) else attention_mask if self.config.num_nextn_predict_layers > 0: inputs_embeds_extra = inputs_embeds[:, -self.config.num_nextn_predict_layers :, :] # [B, S, D] @@ -1982,7 +1976,7 @@ def __init__(self, config: DeepseekV2FastConfig, hidden_size=None, eps=1e-6, use mark_as_sequence_parallel_parameter(self.weight) def forward(self, hidden_states): - if self.config.fuse_rms_norm: + if True: return RmsNormFunction.apply(hidden_states, self.weight, self.variance_epsilon) with paddle.amp.auto_cast(False): diff --git a/examples/experiments/deepseek_v3_pretrain/run_pretrain.py b/examples/experiments/deepseek_v3_pretrain/run_pretrain.py index 12f7556dd82..3948b5a0033 100644 --- a/examples/experiments/deepseek_v3_pretrain/run_pretrain.py +++ b/examples/experiments/deepseek_v3_pretrain/run_pretrain.py @@ -566,9 +566,7 @@ def main(): # config.using_flex_token = True # config.num_nextn_predict_layers = 1 - # config.using_fake_gate = True - # config.fuse_rms_norm = True - # config.fuse_attention_ffn = True + # config.moe_router_force_load_balancing = True # config.apply_rope_fusion = True # config.token_drop_steps = 0 model = model_class.from_config(config, dtype=dtype) @@ -626,8 
+624,8 @@ def main(): callbacks += [MoeExpertsGradScaleCallback(training_args)] if getattr(config, "topk_method", None) == "noaux_tc": - aux_loss_free_gamma = getattr(config, "aux_loss_free_gamma", 0.001) - callbacks += [MoECorrectionBiasAdjustCallback(aux_loss_free_gamma)] + moe_router_bias_update_rate = getattr(config, "moe_router_bias_update_rate", 0.001) + callbacks += [MoECorrectionBiasAdjustCallback(moe_router_bias_update_rate)] def resume_from_custom_func(model): if training_args.resume_from_huggingface_ckpt: diff --git a/examples/experiments/ernie_pretrain/ernie/model_config.py b/examples/experiments/ernie_pretrain/ernie/model_config.py index 6548a179ba9..48454181fdc 100644 --- a/examples/experiments/ernie_pretrain/ernie/model_config.py +++ b/examples/experiments/ernie_pretrain/ernie/model_config.py @@ -103,7 +103,7 @@ class ModelConfig: neftune: bool = field(default=False, metadata={"help": "Whether to apply NEFT"}) neftune_noise_alpha: float = field(default=5.0, metadata={"help": "NEFT noise alpha"}) flash_mask: bool = field(default=False, metadata={"help": "Whether to use flash_mask in flash attention."}) - attn_impl: str = field(default="flashmask", metadata={"help": "Attention implementation"}) + _attn_implementation: str = field(default="flashmask", metadata={"help": "Attention implementation"}) # long sequence strategy use_long_sequence_strategies: bool = field( diff --git a/examples/experiments/ernie_pretrain/model_configs/ERNIE-4p5-21B-A3B/model_config.json b/examples/experiments/ernie_pretrain/model_configs/ERNIE-4p5-21B-A3B/model_config.json index 8d5affa0756..f36f380e2db 100644 --- a/examples/experiments/ernie_pretrain/model_configs/ERNIE-4p5-21B-A3B/model_config.json +++ b/examples/experiments/ernie_pretrain/model_configs/ERNIE-4p5-21B-A3B/model_config.json @@ -26,7 +26,6 @@ "use_recompute_moe": false, "use_recompute_loss_fn": false, "use_rmsnorm": true, - "fuse_rms_norm": true, "use_bias": false, "use_fast_ln": true, "fuse_attn_ffn": true, diff --git a/examples/experiments/ernie_pretrain/model_configs/ERNIE-4p5-300B-A47B/model_config.json b/examples/experiments/ernie_pretrain/model_configs/ERNIE-4p5-300B-A47B/model_config.json index 720e97bf525..cf0428004ce 100644 --- a/examples/experiments/ernie_pretrain/model_configs/ERNIE-4p5-300B-A47B/model_config.json +++ b/examples/experiments/ernie_pretrain/model_configs/ERNIE-4p5-300B-A47B/model_config.json @@ -26,7 +26,6 @@ "use_recompute_moe": false, "use_recompute_loss_fn": false, "use_rmsnorm": true, - "fuse_rms_norm": true, "use_bias": false, "fuse_attn_ffn": true, "fuse_linear": true, diff --git a/examples/experiments/ernie_pretrain/models/ernie/configuration.py b/examples/experiments/ernie_pretrain/models/ernie/configuration.py index d3b1cc2ba7e..072e602ef97 100644 --- a/examples/experiments/ernie_pretrain/models/ernie/configuration.py +++ b/examples/experiments/ernie_pretrain/models/ernie/configuration.py @@ -88,7 +88,7 @@ def __init__( use_recompute_attn=False, recompute_use_reentrant=False, use_rmsnorm=True, - fuse_rms_norm=False, + fuse_rms_norm=True, fuse_ln=False, pad_token_id=0, bos_token_id=1, diff --git a/examples/experiments/ernie_pretrain/models/ernie/modeling_moe.py b/examples/experiments/ernie_pretrain/models/ernie/modeling_moe.py index aa1287a5420..901ce17ed0a 100644 --- a/examples/experiments/ernie_pretrain/models/ernie/modeling_moe.py +++ b/examples/experiments/ernie_pretrain/models/ernie/modeling_moe.py @@ -795,7 +795,7 @@ def __init__(self, config, layer_idx): self.use_rms_qkv_recompute = 
config.use_rms_qkv_recompute if config.use_rms_qkv_recompute is True: - assert config.use_rmsnorm is True and config.fuse_rms_norm is True + assert config.use_rmsnorm is True assert config.fuse_linear is True and config.use_bias is False assert self.fuse_attn is True @@ -1012,7 +1012,7 @@ def __init__(self, config, layer_idx): if self.use_linear_residual_norm_recompute is True: assert config.hidden_dropout_prob == 0.0 assert config.fuse_linear is True and config.use_bias is False - assert config.use_rmsnorm is True and config.fuse_rms_norm is True + assert config.use_rmsnorm is True self.fused_linear_add_norm = FusedLinearAddNorm(self.hidden_size, config.rms_norm_eps) del self.self_attn.o_proj else: diff --git a/examples/experiments/paddlefleet/run_pretrain.py b/examples/experiments/paddlefleet/run_pretrain.py index 14619bb6d13..d941721a717 100644 --- a/examples/experiments/paddlefleet/run_pretrain.py +++ b/examples/experiments/paddlefleet/run_pretrain.py @@ -641,8 +641,8 @@ def main(): callbacks += [MoeExpertsGradScaleCallback(training_args)] if getattr(config, "topk_method", None) == "noaux_tc": - aux_loss_free_gamma = getattr(config, "aux_loss_free_gamma", 0.001) - callbacks += [MoECorrectionBiasAdjustCallback(aux_loss_free_gamma)] + moe_router_bias_update_rate = getattr(config, "moe_router_bias_update_rate", 0.001) + callbacks += [MoECorrectionBiasAdjustCallback(moe_router_bias_update_rate)] def resume_from_custom_func(model): if training_args.resume_from_huggingface_ckpt: diff --git a/paddleformers/cli/hparams/model_args.py b/paddleformers/cli/hparams/model_args.py index 90aef9f4b6d..130c48e5c57 100644 --- a/paddleformers/cli/hparams/model_args.py +++ b/paddleformers/cli/hparams/model_args.py @@ -122,7 +122,7 @@ class ModelArguments: default=False, metadata={"help": "GPT3 model, use fast layernorm"}, ) - attn_impl: str = field(default="flashmask", metadata={"help": "Attention implementation"}) + _attn_implementation: str = field(default="flashmask", metadata={"help": "Attention implementation"}) fuse_gate_detach_matmul: bool = field( default=True, metadata={"help": "Whether to use the fused gate-detach matmul implementation."}, diff --git a/paddleformers/cli/train/deepseek_v3_pretrain/configuration.py b/paddleformers/cli/train/deepseek_v3_pretrain/configuration.py index 01af544be36..79c392e6a35 100644 --- a/paddleformers/cli/train/deepseek_v3_pretrain/configuration.py +++ b/paddleformers/cli/train/deepseek_v3_pretrain/configuration.py @@ -68,7 +68,7 @@ class DeepseekV2FastConfig(PretrainedConfig): Whether to normalize the weights of the routed experts. scoring_func (`str`, *optional*, defaults to 'softmax'): Method of computing expert weights. - aux_loss_alpha (`float`, *optional*, defaults to 0.001): + router_aux_loss_coef (`float`, *optional*, defaults to 0.001): Auxiliary loss weight coefficient. seq_aux = (`bool`, *optional*, defaults to True): Whether to compute the auxiliary loss for each individual sample. 
@@ -158,7 +158,7 @@ def __init__( first_k_dense_replace=0, norm_topk_prob=False, scoring_func="softmax", - aux_loss_alpha=0.001, + router_aux_loss_coef=0.001, seq_aux=True, hidden_act="silu", max_position_embeddings=2048, @@ -233,7 +233,7 @@ def __init__( self.first_k_dense_replace = first_k_dense_replace self.norm_topk_prob = norm_topk_prob self.scoring_func = scoring_func - self.aux_loss_alpha = aux_loss_alpha + self.router_aux_loss_coef = router_aux_loss_coef self.seq_aux = seq_aux # for backward compatibility if num_key_value_heads is None: diff --git a/paddleformers/cli/train/deepseek_v3_pretrain/modeling.py b/paddleformers/cli/train/deepseek_v3_pretrain/modeling.py index bb64e4098da..759562d5bfe 100644 --- a/paddleformers/cli/train/deepseek_v3_pretrain/modeling.py +++ b/paddleformers/cli/train/deepseek_v3_pretrain/modeling.py @@ -259,7 +259,6 @@ def __init__(self, config: DeepseekV2FastConfig, hidden_size=None, intermediate_ self.config = config self.hidden_size = config.hidden_size if hidden_size is None else hidden_size self.intermediate_size = config.intermediate_size if intermediate_size is None else intermediate_size - self.fuse_attention_ffn = config.fuse_attention_ffn Linear = FP8Linear if self.config.dsv3_use_fp8_gemm else Linear_ def linear_dtype_gaurd(): @@ -296,20 +295,13 @@ def linear_dtype_gaurd(): has_bias=False, ) else: - if config.fuse_attention_ffn: - self.gate_up_fused_proj = Linear(self.hidden_size, self.intermediate_size * 2, bias_attr=False) - else: - self.gate_proj = Linear(self.hidden_size, self.intermediate_size, bias_attr=False) - self.up_proj = Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + self.gate_up_fused_proj = Linear(self.hidden_size, self.intermediate_size * 2, bias_attr=False) self.down_proj = Linear(self.intermediate_size, self.hidden_size, bias_attr=False) self.act_fn = ACT2FN[config.hidden_act] def forward(self, x): - if self.fuse_attention_ffn: - x = swiglu(self.gate_up_fused_proj(x)) - else: - x = swiglu(self.gate_proj(x), self.up_proj(x)) + x = swiglu(self.gate_up_fused_proj(x)) out = self.down_proj(x) return out @@ -371,7 +363,7 @@ def forward(self, hidden_states): # compute gating score if self.using_post_norm_recompute: logits, norm_out = FusedNormGateFunc.apply(hidden_states, self.norm_weight, self.weight, self.norm_eps) - if hasattr(self.config, "using_fake_gate") and self.config.using_fake_gate: + if hasattr(self.config, "moe_router_force_load_balancing") and self.config.moe_router_force_load_balancing: logits = FakeGate.apply( hidden_states, self.weight, @@ -381,7 +373,10 @@ def forward(self, hidden_states): else: with paddle.amp.auto_cast(False): hidden_states = hidden_states.cast(self.weight.dtype) - if hasattr(self.config, "using_fake_gate") and self.config.using_fake_gate: + if ( + hasattr(self.config, "moe_router_force_load_balancing") + and self.config.moe_router_force_load_balancing + ): logits = FakeGate.apply( hidden_states, self.weight, @@ -474,7 +469,7 @@ def __init__(self, config: DeepseekV2FastConfig, norm_weight=None, norm_eps=None p.expert = not self.is_mp_moe logger.info(f"expert no-sync={p.no_sync}-{p.name}") - self.alpha = config.aux_loss_alpha + self.alpha = config.router_aux_loss_coef if config.n_shared_experts is not None: intermediate_size = config.moe_intermediate_size * config.n_shared_experts if self.using_post_norm_recompute: @@ -1659,8 +1654,7 @@ def forward( attention_mask = self._prepare_decoder_attention_mask( attention_mask, (batch_size, seq_length), past_key_values_length, 
inputs_embeds.dtype ) # [bs, 1, seq_len, seq_len] - if self.config.use_flash_attention: - attention_mask = None if is_casual_mask(attention_mask) else attention_mask + attention_mask = None if is_casual_mask(attention_mask) else attention_mask if self.config.num_nextn_predict_layers > 0: inputs_embeds_extra = inputs_embeds[:, -self.config.num_nextn_predict_layers :, :] # [B, S, D] @@ -1983,7 +1977,7 @@ def __init__(self, config: DeepseekV2FastConfig, hidden_size=None, eps=1e-6, use mark_as_sequence_parallel_parameter(self.weight) def forward(self, hidden_states): - if self.config.fuse_rms_norm: + if True: return RmsNormFunction.apply(hidden_states, self.weight, self.variance_epsilon) with paddle.amp.auto_cast(False): diff --git a/paddleformers/cli/train/deepseek_v3_pretrain/workflow.py b/paddleformers/cli/train/deepseek_v3_pretrain/workflow.py index 45e41aa0215..a9d69077e70 100644 --- a/paddleformers/cli/train/deepseek_v3_pretrain/workflow.py +++ b/paddleformers/cli/train/deepseek_v3_pretrain/workflow.py @@ -496,9 +496,7 @@ def run_dsv3_pretrain(model_args, data_args, generating_args, training_args): # config.using_flex_token = True # config.num_nextn_predict_layers = 1 - # config.using_fake_gate = True - # config.fuse_rms_norm = True - # config.fuse_attention_ffn = True + # config.moe_router_force_load_balancing = True # config.apply_rope_fusion = True # config.token_drop_steps = 0 model = model_class.from_config(config, dtype=dtype) @@ -556,8 +554,8 @@ def run_dsv3_pretrain(model_args, data_args, generating_args, training_args): callbacks += [MoeExpertsGradScaleCallback(training_args)] if getattr(config, "topk_method", None) == "noaux_tc": - aux_loss_free_gamma = getattr(config, "aux_loss_free_gamma", 0.001) - callbacks += [MoECorrectionBiasAdjustCallback(aux_loss_free_gamma)] + moe_router_bias_update_rate = getattr(config, "moe_router_bias_update_rate", 0.001) + callbacks += [MoECorrectionBiasAdjustCallback(moe_router_bias_update_rate)] def resume_from_custom_func(model): if training_args.resume_from_huggingface_ckpt: diff --git a/paddleformers/cli/train/dpo/dpo_argument.py b/paddleformers/cli/train/dpo/dpo_argument.py index 71bc3b42458..a02ad6230ef 100644 --- a/paddleformers/cli/train/dpo/dpo_argument.py +++ b/paddleformers/cli/train/dpo/dpo_argument.py @@ -155,4 +155,4 @@ class DPOModelArgument: use_quick_lora: bool = field(default=True, metadata={"help": "quick lora"}) # Attention - attn_impl: str = field(default="flashmask", metadata={"help": "Attention implementation"}) + _attn_implementation: str = field(default="flashmask", metadata={"help": "Attention implementation"}) diff --git a/paddleformers/cli/train/dpo/workflow.py b/paddleformers/cli/train/dpo/workflow.py index 64c1002c26a..86ba1aabca4 100644 --- a/paddleformers/cli/train/dpo/workflow.py +++ b/paddleformers/cli/train/dpo/workflow.py @@ -71,8 +71,10 @@ def run_dpo( set_seed(training_args.seed) avaible_attn_impl = AttentionInterface._global_mapping.keys() - if model_args.attn_impl not in avaible_attn_impl: - raise ValueError(f"Invalid attn_impl: {model_args.attn_impl}, available attn_impl: {avaible_attn_impl}") + if model_args._attn_implementation not in avaible_attn_impl: + raise ValueError( + f"Invalid _attn_implementation: {model_args._attn_implementation}, available _attn_implementation: {avaible_attn_impl}" + ) if training_args.loss_type == "orpo": training_args.reference_free = True @@ -148,7 +150,7 @@ def run_dpo( model_args.model_name_or_path, dtype=dtype, ) - model_config._attn_implementation = model_args.attn_impl 
+ model_config._attn_implementation = model_args._attn_implementation model_config.pp_seg_method = model_args.pp_seg_method model_config.max_sequence_length = data_args.max_seq_len model_config.seq_length = data_args.max_seq_len @@ -164,7 +166,7 @@ def run_dpo( ref_model_config.pp_seg_method = model_args.pp_seg_method ref_model_config.max_sequence_length = data_args.max_seq_len ref_model_config.seq_length = data_args.max_seq_len - ref_model_config._attn_implementation = model_args.attn_impl + ref_model_config._attn_implementation = model_args._attn_implementation LlmMetaConfig.set_llm_config(ref_model_config, training_args) diff --git a/paddleformers/cli/train/ernie_pretrain/model_config.py b/paddleformers/cli/train/ernie_pretrain/model_config.py index 6548a179ba9..48454181fdc 100644 --- a/paddleformers/cli/train/ernie_pretrain/model_config.py +++ b/paddleformers/cli/train/ernie_pretrain/model_config.py @@ -103,7 +103,7 @@ class ModelConfig: neftune: bool = field(default=False, metadata={"help": "Whether to apply NEFT"}) neftune_noise_alpha: float = field(default=5.0, metadata={"help": "NEFT noise alpha"}) flash_mask: bool = field(default=False, metadata={"help": "Whether to use flash_mask in flash attention."}) - attn_impl: str = field(default="flashmask", metadata={"help": "Attention implementation"}) + _attn_implementation: str = field(default="flashmask", metadata={"help": "Attention implementation"}) # long sequence strategy use_long_sequence_strategies: bool = field( diff --git a/paddleformers/cli/train/ernie_pretrain/models/ernie/configuration.py b/paddleformers/cli/train/ernie_pretrain/models/ernie/configuration.py index 31460565972..a97bf124cdc 100644 --- a/paddleformers/cli/train/ernie_pretrain/models/ernie/configuration.py +++ b/paddleformers/cli/train/ernie_pretrain/models/ernie/configuration.py @@ -88,7 +88,7 @@ def __init__( use_recompute_attn=False, recompute_use_reentrant=False, use_rmsnorm=True, - fuse_rms_norm=False, + fuse_rms_norm=True, fuse_ln=False, pad_token_id=0, bos_token_id=1, diff --git a/paddleformers/cli/train/sft/workflow.py b/paddleformers/cli/train/sft/workflow.py index b3e8e1d2599..73342c2bb28 100644 --- a/paddleformers/cli/train/sft/workflow.py +++ b/paddleformers/cli/train/sft/workflow.py @@ -252,20 +252,22 @@ def run_sft( model_config.ignore_index = -100 avaible_attn_impl = AttentionInterface._global_mapping.keys() - if model_args.attn_impl not in avaible_attn_impl: - raise ValueError(f"Invalid attn_impl: {model_args.attn_impl}, available attn_impl: {avaible_attn_impl}") + if model_args._attn_implementation not in avaible_attn_impl: + raise ValueError( + f"Invalid _attn_implementation: {model_args._attn_implementation}, available _attn_implementation: {avaible_attn_impl}" + ) model_config.pp_seg_method = model_args.pp_seg_method model_config.seq_length = data_args.max_seq_len model_config.max_sequence_length = data_args.max_seq_len - model_config._attn_implementation = model_args.attn_impl + model_config._attn_implementation = model_args._attn_implementation model_config.is_lora = model_args.lora # Sync arguments to MLLM sub_config if getattr(model_config, "text_config", None) is not None: model_config.text_config.max_sequence_length = data_args.max_seq_len if getattr(model_config, "vision_config", None) is not None: - model_config.vision_config._attn_implementation = model_args.attn_impl + model_config.vision_config._attn_implementation = model_args._attn_implementation model_config.vision_config.recompute_granularity = 
model_config.recompute_granularity model_config.vision_config.recompute_method = model_config.recompute_method model_config.vision_config.recompute_num_layers = model_config.recompute_num_layers diff --git a/paddleformers/nn/moe_deepep/modular_moe_layer.py b/paddleformers/nn/moe_deepep/modular_moe_layer.py index ce463166f25..156559e9d35 100644 --- a/paddleformers/nn/moe_deepep/modular_moe_layer.py +++ b/paddleformers/nn/moe_deepep/modular_moe_layer.py @@ -69,12 +69,12 @@ def __init__( self.sequence_parallel = pretrained_config.get("sequence_parallel", False) self.tensor_model_parallel_size = pretrained_config.get("tensor_model_parallel_size", 1) self.seq_length = pretrained_config.get("seq_length", pretrained_config.get("max_seq_len", 1024)) - self.fuse_up_gate = pretrained_config.get("fuse_attention_ffn", False) - self.ep_communication_type = pretrained_config.get("ep_communication_type", "deepep") + self.fuse_up_gate = True + self.moe_token_dispatcher_type = pretrained_config.get("moe_token_dispatcher_type", "deepep") self.n_group = pretrained_config.get("n_group", 1) self.topk_group = pretrained_config.get("topk_group", 1) self.routed_scaling_factor = pretrained_config.get("routed_scaling_factor", 1.0) - self.aux_loss_alpha = pretrained_config.get("aux_loss_alpha", 0.0) + self.router_aux_loss_coef = pretrained_config.get("router_aux_loss_coef", 0.0) self.moe_subbatch_token_num_before_dispatch = pretrained_config.get( "moe_subbatch_token_num_before_dispatch", -1 ) @@ -167,13 +167,13 @@ def __init__( self.shared_expert = self.expert_class(**shared_expert_args) self.shared_expert_gate = GeneralLinear.create(self.hidden_size, 1, has_bias=False, linear_type="default") - if self.ep_communication_type == "deepep": + if self.moe_token_dispatcher_type == "deepep": self.communication = DeepEPMoECommunication() - elif self.ep_communication_type == "alltoall": + elif self.moe_token_dispatcher_type == "alltoall": self.communication = AllToAllMoECommunication() else: raise ValueError( - f"Unsupported communication type: {self.ep_communication_type}, please choose from ['deepep', 'alltoall']" + f"Unsupported communication type: {self.moe_token_dispatcher_type}, please choose from ['deepep', 'alltoall']" ) if hasattr(dist, "fleet") and dist.is_initialized() and self.expert_model_parallel_size > 1: @@ -264,8 +264,8 @@ def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: reshaped_input = hidden_states output = self._forward_traditional_moe(reshaped_input, topk_indices, topk_weights) - if self.training and self.aux_loss_alpha > 0.0: - aux_loss = aux_loss * self.aux_loss_alpha + if self.training and self.router_aux_loss_coef > 0.0: + aux_loss = aux_loss * self.router_aux_loss_coef output = AddAuxiliaryLoss.apply(output, aux_loss) if self.shared_experts is not None: diff --git a/paddleformers/nn/norm.py b/paddleformers/nn/norm.py index 27606c5fa6f..72452534908 100644 --- a/paddleformers/nn/norm.py +++ b/paddleformers/nn/norm.py @@ -17,11 +17,14 @@ from paddle.distributed.fleet.utils.sequence_parallel_utils import ( mark_as_sequence_parallel_parameter, ) -from paddle.incubate.nn.functional import fused_rms_norm_ext +# from ..cli.utils.process import detect_device from ..generation.configuration_utils import PretrainedConfig from .general import GeneralInterface +# from paddle.incubate.nn.functional import fused_rms_norm_ext + + __all__ = ["Norm"] @@ -65,8 +68,9 @@ def __init__(self, config: PretrainedConfig, hidden_size=None, norm_eps=None, in self.enable_sequence_parallel() def forward(self, 
hidden_states): - if self.config.get("fuse_rms_norm", False): - return fused_rms_norm_ext(hidden_states, self.weight, self.variance_epsilon)[0].astype(self.weight.dtype) + # current_device = detect_device() + # if self.config.get("fuse_rms_norm", False) and current_device != "iluvatar_gpu": + # return fused_rms_norm_ext(hidden_states, self.weight, self.variance_epsilon)[0].astype(self.weight.dtype) if paddle.in_dynamic_mode(): with paddle.amp.auto_cast(False): diff --git a/paddleformers/nn/pp_model.py b/paddleformers/nn/pp_model.py index fd7d8a12057..ad8d4a09a27 100644 --- a/paddleformers/nn/pp_model.py +++ b/paddleformers/nn/pp_model.py @@ -122,7 +122,7 @@ def get_pp_vp_split_layers(config, skip_recompute_num=-1): config (Config): Model configuration object containing: - num_hidden_layers (int): Total number of transformer layers - virtual_pipeline_model_parallel_size (int): Virtual pipeline parallelism degree - - add_tail_layers (int): Additional tail layers to append + - num_empty_layers_add_in_tail (int): Additional tail layers to append skip_recompute_num (int): Number of layers per virtual pipeline stage to exclude from recomputation. Defaults to -1 (auto-configure). Returns: @@ -139,7 +139,7 @@ def get_pp_vp_split_layers(config, skip_recompute_num=-1): assert pp_size > 1, ( "Only support pipeline parallel, " f"pp_size must be greater than 1, but got pp_size: {pp_size}" ) - layer_num = config.num_hidden_layers + config.add_tail_layers + layer_num = config.num_hidden_layers + config.num_empty_layers_add_in_tail if skip_recompute_num == -1: # select all layers to skip recompute @@ -614,7 +614,7 @@ def __init__(self, config: PretrainedConfig, **kwargs): LayerDesc(MTPLayerPipeCls, config=config, layer_idx=config.num_hidden_layers + i), f"model.layers.{config.num_hidden_layers + i}", ) - for i in range(config.add_tail_layers): + for i in range(config.num_empty_layers_add_in_tail): self.add_sequential_layer( LayerDesc( EmptyLayer, @@ -651,7 +651,9 @@ def __init__(self, config: PretrainedConfig, **kwargs): if ( seg_method == "layer:DecoderLayer|EmptyLayer" - and (config.num_hidden_layers + config.add_tail_layers) % get_hcg().topology().get_dim_size("pipe") != 0 + and (config.num_hidden_layers + config.num_empty_layers_add_in_tail) + % get_hcg().topology().get_dim_size("pipe") + != 0 ): seg_method = "uniform" logger.info(f"using recompute_interval={recompute_interval}, seg_method={seg_method}") diff --git a/paddleformers/trainer/training_args.py b/paddleformers/trainer/training_args.py index 595283d90bd..b5cd819dab0 100644 --- a/paddleformers/trainer/training_args.py +++ b/paddleformers/trainer/training_args.py @@ -1092,7 +1092,7 @@ class TrainingArguments: default=False, metadata={"help": "Enable MoE (Mixture of Experts) expert parallel training"}, ) - aux_loss_alpha: Optional[float] = field( + router_aux_loss_coef: Optional[float] = field( default=0.0001, metadata={"help": "MoE (Mixture of Experts) Auxiliary loss weight coefficient"}, ) diff --git a/paddleformers/transformers/attention_utils.py b/paddleformers/transformers/attention_utils.py index cf8ced7b334..6d79f9a08ae 100755 --- a/paddleformers/transformers/attention_utils.py +++ b/paddleformers/transformers/attention_utils.py @@ -553,7 +553,7 @@ def __init__( self.v_proj = Linear3D(embed_dim, num_heads, self.head_dim, weight_attr, bias_attr=bias_attr) self.out_proj = nn.Linear(embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) - self.attn_impl = AttentionRegistry.cls_dict[attention_type]( + self._attn_implementation = 
AttentionRegistry.cls_dict[attention_type]( num_heads, block_size, window_size, num_global_blocks, num_rand_blocks, seed ) @@ -603,7 +603,9 @@ def forward( else: q, k, v, cache = self._prepare_qkv(query, key, value, cache) - out = self.attn_impl(q, k, v, self.head_dim, attn_mask, rand_mask_idx, query_mask, key_mask, self.dropout) + out = self._attn_implementation( + q, k, v, self.head_dim, attn_mask, rand_mask_idx, query_mask, key_mask, self.dropout + ) # combine heads out = paddle.transpose(out, perm=[0, 2, 1, 3]) out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) diff --git a/paddleformers/transformers/configuration_utils.py b/paddleformers/transformers/configuration_utils.py index 50a19c566e2..b83b3c40393 100644 --- a/paddleformers/transformers/configuration_utils.py +++ b/paddleformers/transformers/configuration_utils.py @@ -229,13 +229,9 @@ def llmmetaclass(cls): class LlmMetaConfig: op_fusion_attributes = [ # name, type, default_value, comment - ("use_flash_attention", bool, False, "Only used in `ernie45_vl` and `deepseek_v3_pretrain`."), - ("fuse_rms_norm", bool, False, "Whether to fuse RMSNorm for efficiency"), ("use_fused_linear_cross_entropy", bool, False, "use fused `linear + cross_entropy` fuse op."), ("apply_rope_fusion", bool, False, "Whether to fuse RoPE operation"), ("fuse_swiglu", bool, False, "Whether to fuse SwiGLU operations"), - ("fuse_attention_qkv", bool, False, "Whether to fuse Attention QKV operations"), - ("fuse_attention_ffn", bool, False, "Whether to fuse Attention FFN operations"), ] hybrid_parallel_attributes = [ @@ -254,7 +250,7 @@ class LlmMetaConfig: ("context_parallel_size", int, 1, "context_parallel_size"), # pp refine recompute ("no_recompute_layers", Optional[List[int]], None, "no_recompute_layers"), - ("add_tail_layers", int, 0, "Additional layers to append at the end"), + ("num_empty_layers_add_in_tail", int, 0, "Additional layers to append at the end"), # sep_parallel ("sep_parallel_size", int, 1, "sep_parallel_size"), ("context_parallel_size", int, 1, "context_parallel_size"), @@ -302,8 +298,8 @@ class LlmMetaConfig: 0, "The number of tokens in each subbatch for MoE model processing.", ), - ("using_fake_gate", bool, False, "Whether to fake gate."), - ("ep_communication_type", str, "deepep", 'Communication type used by MoE module "deepep" or "alltoall". '), + ("moe_router_force_load_balancing", bool, False, "Whether to force load balancing in the MoE router (formerly `using_fake_gate`)."), + ("moe_token_dispatcher_type", str, "deepep", 'Communication type used by MoE module "deepep" or "alltoall". '), ("use_unified_moe", bool, False, "Whether to use unified moe."), ( "moe_deepep_num_sms", @@ -748,7 +744,7 @@ class PretrainedConfig: `"single_label_classification"` or `"multi_label_classification"`. moe_subbatch_token_num_before_dispatch (`int`, *optional*, defaults to 0): The number of tokens in a subbatch for MoE. - ep_communication_type (`str`, *optional*, defaults to `deepep`): + moe_token_dispatcher_type (`str`, *optional*, defaults to `deepep`): Communication type for expert parallel. Can be one of `deepep`, `alltoall`. use_unified_moe (`bool`, *optional*, defaults to `False`): Whether to use unified MoE.
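# --- Illustrative sketch (not part of this patch): mapping the renamed configuration keys. ---
# The hunks above and below rename several options; existing YAML/JSON configs written
# against the old names can be translated with a small helper like this one.
# `migrate_config`, `_RENAMED_KEYS`, and `_DROPPED_KEYS` are hypothetical names used only
# for illustration, not APIs from this repository.
_RENAMED_KEYS = {
    "attn_impl": "_attn_implementation",
    "aux_loss_alpha": "router_aux_loss_coef",
    "aux_loss_free_gamma": "moe_router_bias_update_rate",
    "using_fake_gate": "moe_router_force_load_balancing",
    "ep_communication_type": "moe_token_dispatcher_type",
    "add_tail_layers": "num_empty_layers_add_in_tail",
}
# Keys removed outright by this patch; their behavior is now hard-wired in the code.
_DROPPED_KEYS = {"use_flash_attention", "fuse_rms_norm", "fuse_attention_qkv", "fuse_attention_ffn"}

def migrate_config(old: dict) -> dict:
    """Return a copy of `old` with renamed keys translated and dropped keys removed."""
    new = {}
    for key, value in old.items():
        if key in _DROPPED_KEYS:
            continue
        new[_RENAMED_KEYS.get(key, key)] = value
    return new

# Example: migrate_config({"attn_impl": "flashmask", "aux_loss_alpha": 1e-4})
# returns {"_attn_implementation": "flashmask", "router_aux_loss_coef": 1e-4}.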
@@ -829,8 +825,6 @@ def __init__(self, **kwargs): llm_meta = LlmMetaConfig._get_init() self._unsavable_keys.update(LlmMetaConfig._get_unsavable_keys()) self._unsavable_keys.remove("tensor_model_parallel_size") - self._unsavable_keys.remove("fuse_attention_qkv") - self._unsavable_keys.remove("fuse_attention_ffn") self._unsavable_keys.add("_attn_implementation") kwargs = set_expected_keys(self, llm_meta, kwargs) @@ -853,10 +847,6 @@ def __init__(self, **kwargs): self.sep_parallel_size = 1 self.context_parallel_size = 1 - # for transformers fuse - self.fuse_attention_qkv = kwargs.pop("fuse_attention_qkv", False) - self.fuse_attention_ffn = kwargs.pop("fuse_attention_ffn", False) - # for general components self._attn_implementation = kwargs.pop("_attn_implementation", "eager") @@ -905,9 +895,9 @@ def __init__(self, **kwargs): self.dpo_config = kwargs.pop("dpo_config", None) self.kto_config = kwargs.pop("kto_config", None) - self.ep_communication_type = kwargs.pop("ep_communication_type", "deepep") + self.moe_token_dispatcher_type = kwargs.pop("moe_token_dispatcher_type", "deepep") self.use_unified_moe = kwargs.pop("use_unified_moe", False) - self.using_fake_gate = kwargs.pop("using_fake_gate", False) + self.moe_router_force_load_balancing = kwargs.pop("moe_router_force_load_balancing", False) # Tokenizer arguments TODO: eventually tokenizer and models should share the same config self.tokenizer_class = kwargs.pop("tokenizer_class", None) diff --git a/paddleformers/transformers/deepseek_v3/configuration.py b/paddleformers/transformers/deepseek_v3/configuration.py index d61c7c01782..fd0bfd0ff11 100644 --- a/paddleformers/transformers/deepseek_v3/configuration.py +++ b/paddleformers/transformers/deepseek_v3/configuration.py @@ -70,7 +70,7 @@ class DeepseekV3Config(PretrainedConfig): Whether to normalize the weights of the routed experts. scoring_func (`str`, *optional*, defaults to 'softmax'): Method of computing expert weights. - aux_loss_alpha (`float`, *optional*, defaults to 0.001): + router_aux_loss_coef (`float`, *optional*, defaults to 0.001): Auxiliary loss weight coefficient. seq_aux = (`bool`, *optional*, defaults to True): Whether to compute the auxiliary loss for each individual sample. 
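# --- Illustrative sketch (not part of this patch): how `router_aux_loss_coef` enters training. ---
# In the MoE layer hunk above, the auxiliary loss is multiplied by the renamed coefficient
# before being attached to the layer output. The balance term below is the generic top-k
# load-balancing formulation, shown only for intuition; the exact computation in this
# repository may differ, and `scaled_aux_loss` is a hypothetical helper name.
def scaled_aux_loss(expert_fraction, router_prob_mean, router_aux_loss_coef=0.0001):
    """expert_fraction[i]: fraction of tokens dispatched to expert i.
    router_prob_mean[i]: mean router probability assigned to expert i."""
    num_experts = len(expert_fraction)
    balance = num_experts * sum(f * p for f, p in zip(expert_fraction, router_prob_mean))
    return router_aux_loss_coef * balance

# With perfectly uniform routing over 4 experts the loss collapses to the coefficient:
# scaled_aux_loss([0.25] * 4, [0.25] * 4) == 0.0001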
@@ -161,7 +161,7 @@ def __init__( first_k_dense_replace=0, norm_topk_prob=False, scoring_func="softmax", - aux_loss_alpha=0.0001, + router_aux_loss_coef=0.0001, seq_aux=True, hidden_act="silu", max_position_embeddings=2048, @@ -208,7 +208,7 @@ def __init__( self.first_k_dense_replace = first_k_dense_replace self.norm_topk_prob = norm_topk_prob self.scoring_func = scoring_func - self.aux_loss_alpha = aux_loss_alpha + self.router_aux_loss_coef = router_aux_loss_coef self.seq_aux = seq_aux # for backward compatibility if num_key_value_heads is None: diff --git a/paddleformers/transformers/deepseek_v3/modeling.py b/paddleformers/transformers/deepseek_v3/modeling.py index cb4f68fdb30..8d5d3274731 100644 --- a/paddleformers/transformers/deepseek_v3/modeling.py +++ b/paddleformers/transformers/deepseek_v3/modeling.py @@ -300,7 +300,7 @@ def forward(self, hidden_states): with paddle.amp.auto_cast(False): hidden_states = hidden_states.cast(self.weight.dtype) - if hasattr(self.config, "using_fake_gate") and self.config.using_fake_gate: + if hasattr(self.config, "moe_router_force_load_balancing") and self.config.moe_router_force_load_balancing: logits = FakeGate.apply(hidden_states, self.weight) else: logits = F.linear(hidden_states, self.weight, None) @@ -488,7 +488,7 @@ def __init__(self, config: DeepseekV3Config): if self.is_mp_moe or self.is_ep_moe: p.is_distributed = True - self.alpha = config.aux_loss_alpha + self.alpha = config.router_aux_loss_coef if config.n_shared_experts is not None: intermediate_size = config.moe_intermediate_size * config.n_shared_experts self.shared_experts = DeepseekV3MLP(config=config, intermediate_size=intermediate_size) @@ -1129,7 +1129,6 @@ def _gen_aoa_config(cls, config: DeepseekV3Config): f"model.layers.$LAYER_ID.mlp.shared_experts.down_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.shared_experts.down_proj.weight", ] } - if config.q_lora_rank: aoa_config["aoa_statements"] += [ f"model.layers.$LAYER_ID.self_attn.q_{x}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.q_{x}_proj.weight" @@ -1138,13 +1137,11 @@ def _gen_aoa_config(cls, config: DeepseekV3Config): aoa_config["aoa_statements"] += [ f"model.layers.$LAYER_ID.self_attn.q_a_layernorm.weight -> {model_prefix}layers.$LAYER_ID.self_attn.q_a_layernorm.weight" ] - aoa_config["aoa_statements"] += [ f"model.layers.$LAYER_ID.self_attn.kv_a_proj_with_mqa.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.kv_a_proj_with_mqa.weight", f"model.layers.$LAYER_ID.self_attn.kv_b_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.kv_b_proj.weight", f"model.layers.$LAYER_ID.self_attn.kv_a_layernorm.weight -> {model_prefix}layers.$LAYER_ID.self_attn.kv_a_layernorm.weight", ] - if config.attention_bias: aoa_config["aoa_statements"] += [ f"model.layers.$LAYER_ID.self_attn.q_a_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.q_a_proj.bias", @@ -1152,43 +1149,30 @@ def _gen_aoa_config(cls, config: DeepseekV3Config): ] # attention qkv - if not config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, 
model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", - f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - ] + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" + for x in ("q", "k", "v") + ] + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.self_attn.{x}_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias" + for x in ("q", "k", "v") + ] # FFN - if not config.fuse_attention_ffn: - aoa_config["aoa_statements"] += ( - [ - f"model.layers.$LAYER_ID.mlp.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.{p}_proj.weight" - for p in ("gate", "up") - ] - + [ - f"model.layers.$LAYER_ID.mlp.shared_experts.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.shared_experts.{p}_proj.weight" - for p in ("gate", "up") - ] - + [ - f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{p}_proj.weight" - for p in ("gate", "up") - ] - ) - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", - f"model.layers.$LAYER_ID.mlp.shared_experts.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.shared_experts.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.shared_experts.up_gate_proj.weight, fused_ffn", - f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight, fused_ffn", + aoa_config["aoa_statements"] += ( + [ + f"model.layers.$LAYER_ID.mlp.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.{p}_proj.weight" + for p in ("gate", "up") + ] + + [ + f"model.layers.$LAYER_ID.mlp.shared_experts.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.shared_experts.{p}_proj.weight" + for p in ("gate", "up") ] + + [ + f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{p}_proj.weight" + for p in ("gate", "up") + ] + ) return aoa_config @@ -1210,7 +1194,6 @@ def _gen_inv_aoa_config(cls, config: DeepseekV3Config): f"{model_prefix}layers.$LAYER_ID.post_attention_layernorm.weight -> model.layers.$LAYER_ID.post_attention_layernorm.weight", f"{model_prefix}layers.$LAYER_ID.mlp.gate.e_score_correction_bias -> model.layers.$LAYER_ID.mlp.gate.e_score_correction_bias", ] - if config.q_lora_rank: aoa_statements += [ f"{model_prefix}layers.$LAYER_ID.self_attn.q_{x}_proj.weight^T -> model.layers.$LAYER_ID.self_attn.q_{x}_proj.weight" @@ -1219,82 +1202,41 @@ def _gen_inv_aoa_config(cls, config: DeepseekV3Config): aoa_statements += [ f"{model_prefix}layers.$LAYER_ID.self_attn.q_a_layernorm.weight -> model.layers.$LAYER_ID.self_attn.q_a_layernorm.weight" ] - aoa_statements += [ f"{model_prefix}layers.$LAYER_ID.self_attn.kv_a_proj_with_mqa.weight^T -> model.layers.$LAYER_ID.self_attn.kv_a_proj_with_mqa.weight", 
f"{model_prefix}layers.$LAYER_ID.self_attn.kv_b_proj.weight^T -> model.layers.$LAYER_ID.self_attn.kv_b_proj.weight", f"{model_prefix}layers.$LAYER_ID.self_attn.kv_a_layernorm.weight -> model.layers.$LAYER_ID.self_attn.kv_a_layernorm.weight", ] - if config.attention_bias: aoa_statements += [ f"{model_prefix}layers.$LAYER_ID.self_attn.q_a_proj.bias -> model.layers.$LAYER_ID.self_attn.q_a_proj.bias", f"{model_prefix}layers.$LAYER_ID.self_attn.kv_a_proj_with_mqa.bias -> model.layers.$LAYER_ID.self_attn.kv_a_proj_with_mqa.bias", ] - if not config.fuse_attention_qkv: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> model.layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias -> model.layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> model.layers.$LAYER_ID.self_attn.{x}_proj.weight" + for x in ("q", "k", "v") + ] + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias -> model.layers.$LAYER_ID.self_attn.{x}_proj.bias" + for x in ("q", "k", "v") + ] + + aoa_statements += ( + [ + f"{model_prefix}layers.$LAYER_ID.mlp.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.{y}_proj.weight" + for y in ("gate", "up") ] - else: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}, axis = 0", + + [ + f"{model_prefix}layers.$LAYER_ID.mlp.shared_experts.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.shared_experts.{y}_proj.weight" + for y in ("gate", "up") ] - aoa_statements += [ - f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" - for layer_id in range(config.num_hidden_layers) - for x in ("q", "k", "v") + + [ + f"{model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{y}_proj.weight" + for y in ("gate", "up") ] + ) - if not config.fuse_attention_ffn: - aoa_statements += ( - [ - f"{model_prefix}layers.$LAYER_ID.mlp.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.{y}_proj.weight" - for y in ("gate", "up") - ] - + [ - f"{model_prefix}layers.$LAYER_ID.mlp.shared_experts.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.shared_experts.{y}_proj.weight" - for y in ("gate", "up") - ] - + [ - f"{model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{y}_proj.weight" - for y in ("gate", "up") - ] - ) - else: - aoa_statements += [ - f"{model_prefix}layers.0.mlp.up_gate_proj.weight^T -> model.layers.0.mlp.gate_proj.weight, model.layers.0.mlp.up_proj.weight, fused_ffn", - f"{model_prefix}layers.0.mlp.gate_proj.weight^T -> model.layers.0.mlp.gate_proj.weight", - f"{model_prefix}layers.0.mlp.up_proj.weight^T -> model.layers.0.mlp.up_proj.weight", - 
f"{model_prefix}layers.$LAYER_ID.mlp.shared_experts.up_gate_proj.weight^T -> model.layers.$LAYER_ID.mlp.shared_experts.gate_proj.weight, model.layers.$LAYER_ID.mlp.shared_experts.up_proj.weight, fused_ffn", - f"{model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight^T -> model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight, fused_ffn", - ] - aoa_statements += ( - [ - f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight^T -> model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight" - for layer_id in range(1, config.num_hidden_layers) - ] - + [ - f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight^T -> model.layers.{layer_id}.mlp.shared_experts.up_proj.weight" - for layer_id in range(1, config.num_hidden_layers) - ] - + [ - f"model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight" - for layer_id in range(1, config.num_hidden_layers) - for expert_id in range(config.n_routed_experts) - ] - + [ - f"model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight" - for layer_id in range(1, config.num_hidden_layers) - for expert_id in range(config.n_routed_experts) - ] - ) aoa_config = {"aoa_statements": aoa_statements} return aoa_config diff --git a/paddleformers/transformers/ernie4_5/modeling.py b/paddleformers/transformers/ernie4_5/modeling.py index e9fda7b09d8..a4d3804b5cf 100644 --- a/paddleformers/transformers/ernie4_5/modeling.py +++ b/paddleformers/transformers/ernie4_5/modeling.py @@ -197,7 +197,6 @@ def __init__(self, config, layer_idx=0): self.num_key_value_heads = config.num_key_value_heads self.head_dim = config.head_dim self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.fuse_attention_qkv = config.fuse_attention_qkv self.gqa_or_mqa = config.num_attention_heads != config.num_key_value_heads if config.tensor_model_parallel_size > 1: @@ -218,36 +217,13 @@ def __init__(self, config, layer_idx=0): kv_hidden_size = self.head_dim * config.num_key_value_heads q_hidden_size = self.head_dim * config.num_attention_heads - if not self.fuse_attention_qkv: - self.q_proj = GeneralLinear.create( - self.hidden_size, - q_hidden_size, - has_bias=config.use_bias, - config=config, - tp_plan="colwise", - ) - self.k_proj = GeneralLinear.create( - self.hidden_size, - kv_hidden_size, - has_bias=config.use_bias, - config=config, - tp_plan="colwise", - ) - self.v_proj = GeneralLinear.create( - self.hidden_size, - kv_hidden_size, - has_bias=config.use_bias, - config=config, - tp_plan="colwise", - ) - else: - self.qkv_proj = GeneralLinear.create( - self.hidden_size, - q_hidden_size + 2 * kv_hidden_size, - has_bias=config.use_bias, - config=config, - tp_plan="colwise", - ) + self.qkv_proj = GeneralLinear.create( + self.hidden_size, + q_hidden_size + 2 * kv_hidden_size, + has_bias=config.use_bias, + config=config, + tp_plan="colwise", + ) self.o_proj = GeneralLinear.create( q_hidden_size, @@ -288,39 +264,27 @@ def forward( - attention_weights: Optional attention probabilities - updated_key_value_cache: Optional updated cache """ - if not self.fuse_attention_qkv: - if self.config.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - else: - bsz, q_len, _ = hidden_states.shape - - 
query_states = self.q_proj(hidden_states).reshape([bsz, q_len, -1, self.head_dim]) - key_states = self.k_proj(hidden_states).reshape([bsz, q_len, -1, self.head_dim]) - value_states = self.v_proj(hidden_states).reshape([bsz, q_len, -1, self.head_dim]) + mix_layer = self.qkv_proj(hidden_states) + if self.config.sequence_parallel: + max_sequence_length = self.config.max_sequence_length + bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length + q_len = max_sequence_length + target_shape = [ + bsz, + q_len, + self.num_key_value_heads, + (self.num_key_value_groups + 2) * self.head_dim, + ] else: - mix_layer = self.qkv_proj(hidden_states) - if self.config.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - target_shape = [ - bsz, - q_len, - self.num_key_value_heads, - (self.num_key_value_groups + 2) * self.head_dim, - ] - else: - target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] - mix_layer = paddle.reshape_(mix_layer, target_shape) - query_states, key_states, value_states = paddle.split( - mix_layer, - num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], - axis=-1, - ) - if self.gqa_or_mqa: - query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) + target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] + mix_layer = paddle.reshape_(mix_layer, target_shape) + query_states, key_states, value_states = paddle.split( + mix_layer, + num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], + axis=-1, + ) + if self.gqa_or_mqa: + query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) # b l h d -> b h l d query_states = query_states.transpose(1, 2) @@ -377,7 +341,7 @@ def __init__(self, config, layer_idx): self.layer_idx = layer_idx self.config = config self.self_attn = Ernie4_5Attention(config, layer_idx) - self.mlp = Ernie4_5MLP(config, fuse_up_gate=config.fuse_attention_ffn) + self.mlp = Ernie4_5MLP(config, fuse_up_gate=True) self.input_layernorm = GeneralNorm.create( config=config, norm_type="rms_norm", @@ -492,30 +456,18 @@ def _gen_aoa_config(cls, config: Ernie4_5Config): } # attention qkv - if not config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - else: + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", + ] + if config.use_bias: aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", + f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> 
{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", ] - if config.use_bias: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - ] # FFN - if not config.fuse_attention_ffn: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.{p}_proj.weight" - for p in ("gate", "up") - ] - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", - ] + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", + ] # lm_head if config.tie_word_embeddings: @@ -535,39 +487,27 @@ def _gen_inv_aoa_config(cls, config: Ernie4_5Config): f"{model_prefix}norm.weight -> model.norm.weight", ] - if not config.fuse_attention_qkv: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> model.layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - else: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", - ] - for layer_id in range(config.num_hidden_layers): - for x in ("q", "k", "v"): - aoa_statements += [ - f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" - ] - if config.use_bias: + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", + ] + for layer_id in range(config.num_hidden_layers): + for x in ("q", "k", "v"): aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", + f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" ] - - if not config.fuse_attention_ffn: + if config.use_bias: aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.mlp.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.{y}_proj.weight" - for y in ("gate", "up") + f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", ] - else: + + aoa_statements += [ + 
f"{model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.gate_proj.weight, model.layers.$LAYER_ID.mlp.up_proj.weight, fused_ffn", + ] + for layer_id in range(config.num_hidden_layers): aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.gate_proj.weight, model.layers.$LAYER_ID.mlp.up_proj.weight, fused_ffn", + f"model.layers.{layer_id}.mlp.gate_proj.weight^T -> model.layers.{layer_id}.mlp.gate_proj.weight", + f"model.layers.{layer_id}.mlp.up_proj.weight^T -> model.layers.{layer_id}.mlp.up_proj.weight", ] - for layer_id in range(config.num_hidden_layers): - aoa_statements += [ - f"model.layers.{layer_id}.mlp.gate_proj.weight^T -> model.layers.{layer_id}.mlp.gate_proj.weight", - f"model.layers.{layer_id}.mlp.up_proj.weight^T -> model.layers.{layer_id}.mlp.up_proj.weight", - ] if config.tie_word_embeddings: aoa_statements += ["lm_head.weight -> _"] diff --git a/paddleformers/transformers/ernie4_5_moe/modeling.py b/paddleformers/transformers/ernie4_5_moe/modeling.py index 2eaab83e850..faf2cf2c0ca 100644 --- a/paddleformers/transformers/ernie4_5_moe/modeling.py +++ b/paddleformers/transformers/ernie4_5_moe/modeling.py @@ -290,7 +290,7 @@ def __init__(self, config, layer_idx): config.hidden_size, config.moe_intermediate_size, layer_idx, - fuse_up_gate=config.fuse_attention_ffn, + fuse_up_gate=True, ) ) else: @@ -308,7 +308,7 @@ def __init__(self, config, layer_idx): deepcopy(config), config.hidden_size, config.moe_intermediate_size * config.moe_num_shared_experts, - fuse_up_gate=config.fuse_attention_ffn, + fuse_up_gate=True, ) use_expert_out_alltoall = use_expert_out_alltoall = "alltoall" in config.moe_multimodal_dispatch_use_allgather use_padding = "unpad" not in config.moe_multimodal_dispatch_use_allgather @@ -367,7 +367,7 @@ def __init__(self, config, layer_idx): config, hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, - fuse_up_gate=config.fuse_attention_ffn, + fuse_up_gate=True, ) if config.sequence_parallel and isinstance( @@ -544,53 +544,23 @@ def _gen_aoa_config(cls, config: Ernie4_5_MoeConfig): } # attention qkv - if not config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - aoa_config["aoa_statements"] += [ - f"model.mtp_block.$LAYER_ID.self_attn.{x}_proj.weight^T -> {model_prefix}mtp_block.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - else: + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", + f"model.mtp_block.$LAYER_ID.self_attn.q_proj.weight^T, model.mtp_block.$LAYER_ID.self_attn.k_proj.weight^T, model.mtp_block.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}mtp_block.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", + ] + if config.use_bias: aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, 
num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", - f"model.mtp_block.$LAYER_ID.self_attn.q_proj.weight^T, model.mtp_block.$LAYER_ID.self_attn.k_proj.weight^T, model.mtp_block.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}mtp_block.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", + f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", + f"model.mtp_block.$LAYER_ID.self_attn.q_proj.bias, model.mtp_block.$LAYER_ID.self_attn.k_proj.bias, model.mtp_block.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}mtp_block.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", ] - if config.use_bias: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - f"model.mtp_block.$LAYER_ID.self_attn.q_proj.bias, model.mtp_block.$LAYER_ID.self_attn.k_proj.bias, model.mtp_block.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}mtp_block.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - ] # FFN - if not config.fuse_attention_ffn: - aoa_config["aoa_statements"] += ( - [ - f"model.layers.$LAYER_ID.mlp.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.{p}_proj.weight" - for p in ("gate", "up") - ] - + [ - f"model.layers.$LAYER_ID.mlp.shared_experts.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.shared_experts.{p}_proj.weight" - for p in ("gate", "up") - ] - + [ - f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{p}_proj.weight" - for p in ("gate", "up") - ] - + [ - f"model.mtp_block.$LAYER_ID.mlp.{p}_proj.weight^T -> {model_prefix}mtp_block.$LAYER_ID.mlp.{p}_proj.weight" - for p in ("gate", "up") - ] - ) - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", - f"model.layers.$LAYER_ID.mlp.shared_experts.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.shared_experts.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.shared_experts.up_gate_proj.weight, fused_ffn", - f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight, fused_ffn", - f"model.mtp_block.$LAYER_ID.mlp.gate_proj.weight^T, model.mtp_block.$LAYER_ID.mlp.up_proj.weight^T -> {model_prefix}mtp_block.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", - ] + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", + 
f"model.layers.$LAYER_ID.mlp.shared_experts.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.shared_experts.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.shared_experts.up_gate_proj.weight, fused_ffn", + f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight, fused_ffn", + f"model.mtp_block.$LAYER_ID.mlp.gate_proj.weight^T, model.mtp_block.$LAYER_ID.mlp.up_proj.weight^T -> {model_prefix}mtp_block.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", + ] if config.tie_word_embeddings: aoa_config["aoa_statements"] += ["model.embed_tokens.weight -> lm_head.weight"] @@ -620,83 +590,53 @@ def _gen_inv_aoa_config(cls, config: Ernie4_5_MoeConfig): f"{model_prefix}mtp_linear_proj.$LAYER_ID.weight^T -> model.mtp_linear_proj.$LAYER_ID.weight", ] - if not config.fuse_attention_qkv: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> model.layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - aoa_statements += [ - f"{model_prefix}mtp_block.$LAYER_ID.self_attn.{x}_proj.weight^T -> model.mtp_block.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - else: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", - f"{model_prefix}mtp_block.$LAYER_ID.self_attn.qkv_proj.weight -> model.mtp_block.$LAYER_ID.self_attn.q_proj.weight, model.mtp_block.$LAYER_ID.self_attn.k_proj.weight, model.mtp_block.$LAYER_ID.self_attn.v_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", - ] - for x in ("q", "k", "v"): - for layer_id in range(config.num_hidden_layers): - aoa_statements += [ - f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight", - ] - for layer_id in range(config.num_nextn_predict_layers): - aoa_statements += [ - f"model.mtp_block.{layer_id}.self_attn.{x}_proj.weight^T -> model.mtp_block.{layer_id}.self_attn.{x}_proj.weight", - ] - if config.use_bias: + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", + f"{model_prefix}mtp_block.$LAYER_ID.self_attn.qkv_proj.weight -> model.mtp_block.$LAYER_ID.self_attn.q_proj.weight, model.mtp_block.$LAYER_ID.self_attn.k_proj.weight, model.mtp_block.$LAYER_ID.self_attn.v_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", + ] + for x in ("q", "k", "v"): + for layer_id in range(config.num_hidden_layers): aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - f"{model_prefix}mtp_block.$LAYER_ID.self_attn.qkv_proj.bias -> 
model.mtp_block.$LAYER_ID.self_attn.q_proj.bias, model.mtp_block.$LAYER_ID.self_attn.k_proj.bias, model.mtp_block.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - ] - - if not config.fuse_attention_ffn: - aoa_statements += ( - [ - f"{model_prefix}layers.$LAYER_ID.mlp.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.{y}_proj.weight" - for y in ("gate", "up") - ] - + [ - f"{model_prefix}layers.$LAYER_ID.mlp.shared_experts.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.shared_experts.{y}_proj.weight" - for y in ("gate", "up") + f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight", ] - + [ - f"{model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{y}_proj.weight" - for y in ("gate", "up") - ] - + [ - f"{model_prefix}mtp_block.$LAYER_ID.mlp.{y}_proj.weight^T -> model.mtp_block.$LAYER_ID.mlp.{y}_proj.weight" - for y in ("gate", "up") + for layer_id in range(config.num_nextn_predict_layers): + aoa_statements += [ + f"model.mtp_block.{layer_id}.self_attn.{x}_proj.weight^T -> model.mtp_block.{layer_id}.self_attn.{x}_proj.weight", ] - ) - else: + if config.use_bias: aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.gate_proj.weight, model.layers.$LAYER_ID.mlp.up_proj.weight, fused_ffn", - f"{model_prefix}layers.$LAYER_ID.mlp.shared_experts.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.shared_experts.gate_proj.weight, model.layers.$LAYER_ID.mlp.shared_experts.up_proj.weight, fused_ffn", - f"{model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight, fused_ffn", - f"{model_prefix}mtp_block.$LAYER_ID.mlp.up_gate_proj.weight -> model.mtp_block.$LAYER_ID.mlp.gate_proj.weight, model.mtp_block.$LAYER_ID.mlp.up_proj.weight, fused_ffn", + f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", + f"{model_prefix}mtp_block.$LAYER_ID.self_attn.qkv_proj.bias -> model.mtp_block.$LAYER_ID.self_attn.q_proj.bias, model.mtp_block.$LAYER_ID.self_attn.k_proj.bias, model.mtp_block.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", ] - # mlp - for layer_id in range(config.moe_layer_start_index): - for y in ("gate", "up"): - aoa_statements += [ - f"model.layers.{layer_id}.mlp.{y}_proj.weight^T -> model.layers.{layer_id}.mlp.{y}_proj.weight", - ] - # experts - for layer_id in range(config.moe_layer_start_index, config.num_hidden_layers): - for y in ("gate", "up"): - aoa_statements += [ - f"model.layers.{layer_id}.mlp.shared_experts.{y}_proj.weight^T -> model.layers.{layer_id}.mlp.shared_experts.{y}_proj.weight" - ] - for expert_id in range(config.moe_num_experts): - aoa_statements += [ - f"model.layers.{layer_id}.mlp.experts.{expert_id}.{y}_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.{y}_proj.weight" - ] - # mtp - for layer_id in range(config.num_nextn_predict_layers): - for y in ("gate", "up"): + + aoa_statements += [ + 
f"{model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.gate_proj.weight, model.layers.$LAYER_ID.mlp.up_proj.weight, fused_ffn", + f"{model_prefix}layers.$LAYER_ID.mlp.shared_experts.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.shared_experts.gate_proj.weight, model.layers.$LAYER_ID.mlp.shared_experts.up_proj.weight, fused_ffn", + f"{model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight, fused_ffn", + f"{model_prefix}mtp_block.$LAYER_ID.mlp.up_gate_proj.weight -> model.mtp_block.$LAYER_ID.mlp.gate_proj.weight, model.mtp_block.$LAYER_ID.mlp.up_proj.weight, fused_ffn", + ] + # mlp + for layer_id in range(config.moe_layer_start_index): + for y in ("gate", "up"): + aoa_statements += [ + f"model.layers.{layer_id}.mlp.{y}_proj.weight^T -> model.layers.{layer_id}.mlp.{y}_proj.weight", + ] + # experts + for layer_id in range(config.moe_layer_start_index, config.num_hidden_layers): + for y in ("gate", "up"): + aoa_statements += [ + f"model.layers.{layer_id}.mlp.shared_experts.{y}_proj.weight^T -> model.layers.{layer_id}.mlp.shared_experts.{y}_proj.weight" + ] + for expert_id in range(config.moe_num_experts): aoa_statements += [ - f"model.mtp_block.{layer_id}.mlp.{y}_proj.weight^T -> model.mtp_block.{layer_id}.mlp.{y}_proj.weight" + f"model.layers.{layer_id}.mlp.experts.{expert_id}.{y}_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.{y}_proj.weight" ] + # mtp + for layer_id in range(config.num_nextn_predict_layers): + for y in ("gate", "up"): + aoa_statements += [ + f"model.mtp_block.{layer_id}.mlp.{y}_proj.weight^T -> model.mtp_block.{layer_id}.mlp.{y}_proj.weight" + ] if config.tie_word_embeddings: aoa_statements += ["lm_head.weight -> _"] diff --git a/paddleformers/transformers/ernie4_5_moe_vl/model/configuration.py b/paddleformers/transformers/ernie4_5_moe_vl/model/configuration.py index 984fdd9ad42..fa2a3fd33ee 100644 --- a/paddleformers/transformers/ernie4_5_moe_vl/model/configuration.py +++ b/paddleformers/transformers/ernie4_5_moe_vl/model/configuration.py @@ -44,7 +44,6 @@ "pad_token_id": 0, "use_cache": False, "recompute": False, - "use_flash_attention": True, "use_pure_fp16": False, }, } @@ -75,12 +74,11 @@ def __init__( initializer_range=0.02, # no use rms_norm_eps=1e-6, use_cache=False, - use_flash_attention=True, use_sparse_flash_attn=True, use_var_len_flash_attn=False, recompute_use_reentrant=False, use_rmsnorm=True, - fuse_rms_norm=False, + fuse_rms_norm=True, fuse_ln=False, pad_token_id=0, bos_token_id=1, @@ -93,7 +91,7 @@ def __init__( weight_share_add_bias=True, max_sequence_length=None, ignored_index=-100, - add_tail_layers=False, + num_empty_layers_add_in_tail=False, attention_probs_dropout_prob=0.0, hidden_dropout_prob=0.0, compression_ratio: float = 1.0, @@ -120,7 +118,6 @@ def __init__( num_attention_heads (int): Number of attention heads for each attention layer rms_norm_eps (float): The epsilon used by the RMS normalization layers use_cache (bool): Whether to use caching for faster generation (decoding) - use_flash_attention (bool): Whether to use FlashAttention for optimized attention computation use_sparse_flash_attn (bool): Whether to use sparse FlashAttention use_var_len_flash_attn (bool): Whether to use variable-length FlashAttention recompute_use_reentrant (bool): Whether to use reentrant checkpointing @@ -137,7 +134,7 @@ def __init__( weight_share_add_bias (bool): 
Whether to share bias weights in certain layers max_sequence_length (int): Maximum sequence length for positional embeddings ignored_index (int): Target value that is ignored during loss computation - add_tail_layers (int): Whether to add additional layers at the end + num_empty_layers_add_in_tail (int): Whether to add additional layers at the end attention_probs_dropout_prob (float): Dropout probability for attention weights hidden_dropout_prob (float): Dropout probability for hidden layers compression_ratio (float): Ratio for KV cache compression (1.0 = no compression) @@ -172,7 +169,6 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.use_flash_attention = use_flash_attention self.use_sparse_flash_attn = use_sparse_flash_attn self.recompute_use_reentrant = recompute_use_reentrant self.use_var_len_flash_attn = use_var_len_flash_attn @@ -193,7 +189,7 @@ def __init__( self.fuse_softmax_mask = fuse_softmax_mask self.ignored_index = ignored_index - self.add_tail_layers = add_tail_layers + self.num_empty_layers_add_in_tail = num_empty_layers_add_in_tail self.skip_recompute_ops = dict() self.attention_probs_dropout_prob = attention_probs_dropout_prob diff --git a/paddleformers/transformers/ernie4_5_moe_vl/model/modeling.py b/paddleformers/transformers/ernie4_5_moe_vl/model/modeling.py index b76101bf5f8..190d3894023 100644 --- a/paddleformers/transformers/ernie4_5_moe_vl/model/modeling.py +++ b/paddleformers/transformers/ernie4_5_moe_vl/model/modeling.py @@ -748,10 +748,7 @@ def set_attn_func(self): Selects between flash/core attention. """ config = self.config - if config.use_flash_attention: - self.attn_func = self._flash_attention_wrapper - else: - self.attn_func = self.core_attn + self.attn_func = self._flash_attention_wrapper if config.cachekv_quant: from paddleslim.common.wrapper_function import FuncWrapper diff --git a/paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe.py b/paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe.py index 8948ea0f545..f2753298828 100644 --- a/paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe.py +++ b/paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe.py @@ -1542,9 +1542,7 @@ def get_decoder(self): def prepare_attention_mask_for_generation(self, input_ids, pad_token_id, eos_token_id): """Avoid using attention_mask with flash_attn on generation.""" - if self.config.use_flash_attention: - return None - return super().prepare_attention_mask_for_generation(input_ids, pad_token_id, eos_token_id) + return None def prepare_inputs_for_generation( self, diff --git a/paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe_pp.py b/paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe_pp.py index 0a3be64306e..70915b44816 100644 --- a/paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe_pp.py +++ b/paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe_pp.py @@ -103,7 +103,7 @@ def get_pp_vp_split_layers(config, skip_recompute_num=-1): config (Config): Model configuration object containing: - num_hidden_layers (int): Total number of transformer layers - virtual_pipeline_model_parallel_size (int): Virtual pipeline parallelism degree - - add_tail_layers (int): Additional tail layers to append + - num_empty_layers_add_in_tail (int): Additional tail layers to append skip_recompute_num (int): Number of layers per virtual pipeline stage to exclude from recomputation. Defaults to -1 (auto-configure). 
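# --- Illustrative sketch (not part of this patch): the layer-count check updated by the
# surrounding hunks. Empty tail layers are appended so the total layer count divides the
# pipeline degree evenly; when it does not, segmentation falls back to "uniform".
# `choose_seg_method` is a hypothetical helper name, not a function from this repository.
def choose_seg_method(num_hidden_layers, num_empty_layers_add_in_tail, pp_size,
                      seg_method="layer:DecoderLayer|EmptyLayer"):
    layer_num = num_hidden_layers + num_empty_layers_add_in_tail
    if seg_method.endswith("EmptyLayer") and layer_num % pp_size != 0:
        return "uniform"
    return seg_method

# Example: 61 decoder layers plus 3 empty tail layers over pp_size=8 keeps the layer-based
# segmentation, since (61 + 3) % 8 == 0.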
@@ -122,7 +122,7 @@ def get_pp_vp_split_layers(config, skip_recompute_num=-1): assert pp_size > 1, ( "Only support pipeline parallel, " f"pp_size must be greater than 1, but got pp_size: {pp_size}" ) - layer_num = config.num_hidden_layers + config.add_tail_layers + layer_num = config.num_hidden_layers + config.num_empty_layers_add_in_tail if skip_recompute_num == -1: # select all layers to skip recompute diff --git a/paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe_vl_pp.py b/paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe_vl_pp.py index b9a73730054..92eaad1d9db 100644 --- a/paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe_vl_pp.py +++ b/paddleformers/transformers/ernie4_5_moe_vl/model/modeling_moe_vl_pp.py @@ -1226,7 +1226,7 @@ def _need_full_recompute(layer_idx): f"model.layers.{i}", ) - for i in range(config.add_tail_layers): + for i in range(config.num_empty_layers_add_in_tail): self.add_sequential_layer( LayerDesc( EmptyLayer, @@ -1266,7 +1266,9 @@ def _need_full_recompute(layer_idx): pass if ( seg_method == "layer:Ernie4_5_DecoderLayer|ErnieDecoderLayer|EmptyLayer" - and (config.num_hidden_layers + config.add_tail_layers) % get_hcg().topology().get_dim_size("pipe") != 0 + and (config.num_hidden_layers + config.num_empty_layers_add_in_tail) + % get_hcg().topology().get_dim_size("pipe") + != 0 ): seg_method = "uniform" logger.info(f"using recompute_interval={recompute_interval}, seg_method={seg_method}") diff --git a/paddleformers/transformers/gemma3_text/modeling.py b/paddleformers/transformers/gemma3_text/modeling.py index 0510b211d32..ae07f6af4c0 100644 --- a/paddleformers/transformers/gemma3_text/modeling.py +++ b/paddleformers/transformers/gemma3_text/modeling.py @@ -202,7 +202,6 @@ def __init__(self, config: Gemma3TextConfig, layer_idx: int): self.attention_dropout = config.attention_dropout self.is_causal = not config.use_bidirectional_attention self.attn_implementation = config._attn_implementation - self.fuse_attention_qkv = config.fuse_attention_qkv self.num_heads = config.num_attention_heads self.num_key_value_heads = config.num_key_value_heads @@ -223,36 +222,13 @@ def __init__(self, config: Gemma3TextConfig, layer_idx: int): kv_hidden_size = config.num_key_value_heads * self.head_dim q_hidden_size = config.num_attention_heads * self.head_dim - if not self.fuse_attention_qkv: - self.q_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - self.k_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - self.v_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - else: - self.qkv_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size + 2 * kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) + self.qkv_proj = GeneralLinear.create( + config.hidden_size, + q_hidden_size + 2 * kv_hidden_size, + has_bias=config.attention_bias, + config=config, + tp_plan="colwise", + ) self.o_proj = GeneralLinear.create( q_hidden_size, config.hidden_size, @@ -281,40 +257,26 @@ def forward( use_cache: bool = False, attn_mask_startend_row_indices: Optional[paddle.Tensor] = None, ) -> tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[tuple[paddle.Tensor]]]: - if not self.fuse_attention_qkv: - if self.config.sequence_parallel: - 
max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - else: - bsz, q_len, _ = hidden_states.shape - - hidden_shape = (bsz, q_len, -1, self.head_dim) - - query_states = self.q_proj(hidden_states).reshape(hidden_shape) - key_states = self.k_proj(hidden_states).reshape(hidden_shape) - value_states = self.v_proj(hidden_states).reshape(hidden_shape) + mix_layer = self.qkv_proj(hidden_states) + if self.config.sequence_parallel: + max_sequence_length = self.config.max_sequence_length + bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length + q_len = max_sequence_length + target_shape = [ + bsz, + q_len, + self.num_key_value_heads, + (self.num_key_value_groups + 2) * self.head_dim, + ] else: - mix_layer = self.qkv_proj(hidden_states) - if self.config.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - target_shape = [ - bsz, - q_len, - self.num_key_value_heads, - (self.num_key_value_groups + 2) * self.head_dim, - ] - else: - target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] - mix_layer = paddle.reshape_(mix_layer, target_shape) - query_states, key_states, value_states = paddle.split( - mix_layer, - num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], - axis=-1, - ) - query_states = query_states.reshape([0, 0, -1, self.head_dim]) + target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] + mix_layer = paddle.reshape_(mix_layer, target_shape) + query_states, key_states, value_states = paddle.split( + mix_layer, + num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], + axis=-1, + ) + query_states = query_states.reshape([0, 0, -1, self.head_dim]) query_states = self.q_norm(query_states) key_states = self.k_norm(key_states) @@ -364,7 +326,7 @@ def __init__(self, config: Gemma3TextConfig, layer_idx: int): self.layer_idx = layer_idx self.attention_type = config.layer_types[layer_idx] self.self_attn = Gemma3Attention(config=config, layer_idx=layer_idx) - self.mlp = Gemma3MLP(config, fuse_up_gate=config.fuse_attention_ffn) + self.mlp = Gemma3MLP(config, fuse_up_gate=True) self.input_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps) self.pre_feedforward_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps) @@ -454,35 +416,18 @@ def _gen_aoa_config(cls, config: Gemma3TextConfig): } # attention qkv - if not config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - if config.attention_bias: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, 
num_key_value_groups={config.num_key_value_heads}", + ] + if config.attention_bias: aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", + f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", ] - if config.attention_bias: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - ] # FFN - if not config.fuse_attention_ffn: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.{p}_proj.weight" - for p in ("gate", "up") - ] - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", - ] + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", + ] return aoa_config @@ -507,41 +452,24 @@ def _gen_inv_aoa_config(cls, config: Gemma3TextConfig): f"{model_prefix}layers.$LAYER_ID.self_attn.k_norm.weight -> model.layers.$LAYER_ID.self_attn.k_norm.weight", ] - if not config.fuse_attention_qkv: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> model.layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - if config.attention_bias: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias -> model.layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}" - ] + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", + ] + aoa_statements += [ + f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" + for layer_id in range(config.num_hidden_layers) + for x in ("q", "k", "v") + ] + if config.attention_bias: aoa_statements += [ - f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" - for layer_id in range(config.num_hidden_layers) - for x in ("q", "k", "v") + 
f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", ] - if config.attention_bias: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - ] - if not config.fuse_attention_ffn: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.mlp.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.{y}_proj.weight" - for y in ("gate", "up") - ] - else: - aoa_statements += [ - f"{model_prefix}layers.0.mlp.up_gate_proj.weight -> model.layers.0.mlp.gate_proj.weight, model.layers.0.mlp.up_proj.weight, fused_ffn", - "model.layers.0.mlp.gate_proj.weight^T -> model.layers.0.mlp.gate_proj.weight", - "model.layers.0.mlp.up_proj.weight^T -> model.layers.0.mlp.up_proj.weight", - ] + aoa_statements += [ + f"{model_prefix}layers.0.mlp.up_gate_proj.weight -> model.layers.0.mlp.gate_proj.weight, model.layers.0.mlp.up_proj.weight, fused_ffn", + "model.layers.0.mlp.gate_proj.weight^T -> model.layers.0.mlp.gate_proj.weight", + "model.layers.0.mlp.up_proj.weight^T -> model.layers.0.mlp.up_proj.weight", + ] aoa_config = {"aoa_statements": aoa_statements} return aoa_config diff --git a/paddleformers/transformers/glm4_moe/configuration.py b/paddleformers/transformers/glm4_moe/configuration.py index 645eca58c6a..9790aa3d065 100644 --- a/paddleformers/transformers/glm4_moe/configuration.py +++ b/paddleformers/transformers/glm4_moe/configuration.py @@ -154,7 +154,7 @@ def __init__( pp_seg_method="layer:Glm4MoeDecoderLayer", disable_ffn_model_parallel=False, scoring_func="sigmoid", - aux_loss_alpha=0.0001, + router_aux_loss_coef=0.0001, seq_aux=True, topk_method="noaux_tc", using_flex_token=True, @@ -200,7 +200,7 @@ def __init__( self.norm_topk_prob = norm_topk_prob self.use_qk_norm = use_qk_norm self.scoring_func = scoring_func - self.aux_loss_alpha = aux_loss_alpha + self.router_aux_loss_coef = router_aux_loss_coef self.seq_aux = seq_aux self.topk_method = topk_method self.using_flex_token = using_flex_token diff --git a/paddleformers/transformers/glm4_moe/modeling.py b/paddleformers/transformers/glm4_moe/modeling.py index e6b632ae494..e4db9127953 100644 --- a/paddleformers/transformers/glm4_moe/modeling.py +++ b/paddleformers/transformers/glm4_moe/modeling.py @@ -165,7 +165,6 @@ def __init__(self, config: Glm4MoeConfig, layer_idx: Optional[int] = None): self.tensor_parallel = config.tensor_model_parallel_size > 1 self.sequence_parallel = config.sequence_parallel self.attention_bias = config.attention_bias - self.fuse_attention_qkv = config.fuse_attention_qkv self.gqa_or_mqa = config.num_attention_heads != config.num_key_value_heads if config.tensor_model_parallel_size > 1: @@ -181,36 +180,13 @@ def __init__(self, config: Glm4MoeConfig, layer_idx: Optional[int] = None): kv_hidden_size = self.config.num_key_value_heads * self.head_dim q_hidden_size = self.num_attention_heads * self.head_dim - if not self.fuse_attention_qkv: - self.q_proj = GeneralLinear.create( - self.hidden_size, - q_hidden_size, - has_bias=self.attention_bias, - config=config, - tp_plan="colwise", - ) - self.k_proj = 
GeneralLinear.create( - self.hidden_size, - kv_hidden_size, - has_bias=self.attention_bias, - config=config, - tp_plan="colwise", - ) - self.v_proj = GeneralLinear.create( - self.hidden_size, - kv_hidden_size, - has_bias=self.attention_bias, - config=config, - tp_plan="colwise", - ) - else: - self.qkv_proj = GeneralLinear.create( - self.hidden_size, - q_hidden_size + 2 * kv_hidden_size, - has_bias=self.attention_bias, - config=config, - tp_plan="colwise", - ) + self.qkv_proj = GeneralLinear.create( + self.hidden_size, + q_hidden_size + 2 * kv_hidden_size, + has_bias=self.attention_bias, + config=config, + tp_plan="colwise", + ) self.o_proj = GeneralLinear.create( q_hidden_size, self.hidden_size, @@ -248,42 +224,27 @@ def forward( batch_size: Optional[int] = None, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: - if not self.fuse_attention_qkv: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - else: - bsz, q_len, _ = hidden_states.shape - query_states = query_states.reshape([bsz, q_len, -1, self.head_dim]) - key_states = key_states.reshape([bsz, q_len, -1, self.head_dim]) - value_states = value_states.reshape([bsz, q_len, -1, self.head_dim]) + mix_layer = self.qkv_proj(hidden_states) + if self.sequence_parallel: + max_sequence_length = self.config.max_sequence_length + bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length + q_len = max_sequence_length + target_shape = [ + bsz, + q_len, + self.num_key_value_heads, + (self.num_key_value_groups + 2) * self.head_dim, + ] else: - mix_layer = self.qkv_proj(hidden_states) - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - target_shape = [ - bsz, - q_len, - self.num_key_value_heads, - (self.num_key_value_groups + 2) * self.head_dim, - ] - else: - target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] - mix_layer = paddle.reshape_(mix_layer, target_shape) - query_states, key_states, value_states = paddle.split( - mix_layer, - num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], - axis=-1, - ) - if self.gqa_or_mqa: - query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) + target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] + mix_layer = paddle.reshape_(mix_layer, target_shape) + query_states, key_states, value_states = paddle.split( + mix_layer, + num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], + axis=-1, + ) + if self.gqa_or_mqa: + query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) if self.use_qk_norm: # main diff from Llama query_states = self.q_norm(query_states) @@ -434,9 +395,7 @@ def __init__(self, config): config.sequence_parallel = False self.experts = nn.LayerList( [ - Glm4MoeMLP( - config, intermediate_size=config.moe_intermediate_size, fuse_up_gate=config.fuse_attention_ffn - ) + Glm4MoeMLP(config, intermediate_size=config.moe_intermediate_size, fuse_up_gate=True) for _ in 
range(config.n_routed_experts) ] ) @@ -444,7 +403,7 @@ def __init__(self, config): self.shared_experts = Glm4MoeMLP( config=config, intermediate_size=config.moe_intermediate_size * config.n_shared_experts, - fuse_up_gate=config.fuse_attention_ffn, + fuse_up_gate=True, ) def moe(self, hidden_states: paddle.Tensor, topk_indices: paddle.Tensor, topk_weights: paddle.Tensor): @@ -548,7 +507,7 @@ def __init__(self, config): expert_kwargs={ "config": mlp_config, "intermediate_size": mlp_config.moe_intermediate_size, - "fuse_up_gate": config.fuse_attention_ffn, + "fuse_up_gate": True, }, gate=gate, moe_group=moe_group, @@ -568,13 +527,13 @@ def __init__(self, config): self.shared_experts = Glm4MoeMLP( config=config, intermediate_size=config.moe_intermediate_size * config.n_shared_experts, - fuse_up_gate=config.fuse_attention_ffn, + fuse_up_gate=True, ) def forward(self, hidden_states): final_hidden_states, l_aux, _ = super().forward(hidden_states) - if self.training and self.config.aux_loss_alpha > 0.0: - l_aux = l_aux * self.config.aux_loss_alpha + if self.training and self.config.router_aux_loss_coef > 0.0: + l_aux = l_aux * self.config.router_aux_loss_coef final_hidden_states = AddAuxiliaryLoss.apply(final_hidden_states, l_aux) final_hidden_states = final_hidden_states + self.shared_experts(hidden_states) return final_hidden_states @@ -612,7 +571,7 @@ def __init__(self, config: Glm4MoeConfig, layer_idx: int): ) ) else: - self.mlp = Glm4MoeMLP(config, fuse_up_gate=config.fuse_attention_ffn) + self.mlp = Glm4MoeMLP(config, fuse_up_gate=True) self.input_layernorm = GeneralNorm.create( config=config, @@ -871,15 +830,9 @@ def _gen_aoa_config(cls, config: Glm4MoeConfig): aoa_config["aoa_statements"] += [ f"model.layers.0.mlp.down_proj.weight^T -> {model_prefix}layers.{num_head_empty_layers}.mlp.down_proj.weight" ] - if not config.fuse_attention_ffn: - aoa_config["aoa_statements"] += [ - f"model.layers.0.mlp.gate_proj.weight^T -> {model_prefix}layers.{num_head_empty_layers}.mlp.gate_proj.weight", - f"model.layers.0.mlp.up_proj.weight^T -> {model_prefix}layers.{num_head_empty_layers}.mlp.up_proj.weight", - ] - else: - aoa_config["aoa_statements"] += [ - f"model.layers.0.mlp.gate_proj.weight^T, model.layers.0.mlp.up_proj.weight^T -> {model_prefix}layers.{num_head_empty_layers}.mlp.up_gate_proj.weight, fused_ffn", - ] + aoa_config["aoa_statements"] += [ + f"model.layers.0.mlp.gate_proj.weight^T, model.layers.0.mlp.up_proj.weight^T -> {model_prefix}layers.{num_head_empty_layers}.mlp.up_gate_proj.weight, fused_ffn", + ] # layer0 - layer_num_hidden_layers for layer_idx in reversed(range(0, num_hidden_layers)): @@ -892,19 +845,13 @@ def _gen_aoa_config(cls, config: Glm4MoeConfig): f"{prefix}.self_attn.o_proj.weight^T -> {prefix_offset}.self_attn.o_proj.weight", ] # attention qkv - if not config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"{prefix}.self_attn.{x}_proj.weight^T -> {prefix_offset}.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - else: + aoa_config["aoa_statements"] += [ + f"{prefix}.self_attn.q_proj.weight^T, {prefix}.self_attn.k_proj.weight^T, {prefix}.self_attn.v_proj.weight^T -> {prefix_offset}.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", + ] + if config.attention_bias: aoa_config["aoa_statements"] += [ - f"{prefix}.self_attn.q_proj.weight^T, {prefix}.self_attn.k_proj.weight^T, {prefix}.self_attn.v_proj.weight^T -> {prefix_offset}.self_attn.qkv_proj.weight, fused_qkv, 
num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", + f"{prefix}.self_attn.q_proj.bias, {prefix}.self_attn.k_proj.bias, {prefix}.self_attn.v_proj.bias -> {prefix_offset}.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", ] - if config.attention_bias: - aoa_config["aoa_statements"] += [ - f"{prefix}.self_attn.q_proj.bias, {prefix}.self_attn.k_proj.bias, {prefix}.self_attn.v_proj.bias -> {prefix_offset}.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - ] # layer1 - layer_num_hidden_layers for layer_idx in reversed(range(1, num_hidden_layers)): layer_idx_offset = layer_idx + num_head_empty_layers @@ -925,33 +872,24 @@ def _gen_aoa_config(cls, config: Glm4MoeConfig): ] # FFN - if not config.fuse_attention_ffn: - aoa_config["aoa_statements"] += [ - f"{prefix}.mlp.shared_experts.{p}_proj.weight^T -> {prefix_offset}.mlp.shared_experts.{p}_proj.weight" - for p in ("gate", "up") - ] + [ - f"{prefix}.mlp.experts.$EXPERT_ID.{p}_proj.weight^T -> {prefix_offset}.mlp.experts.$EXPERT_ID.{p}_proj.weight" - for p in ("gate", "up") - ] - else: - aoa_config["aoa_statements"] += [ - f"{prefix}.mlp.shared_experts.gate_proj.weight^T, {prefix}.mlp.shared_experts.up_proj.weight^T -> {prefix_offset}.mlp.shared_experts.up_gate_proj.weight, fused_ffn", - ] - if is_fleet: - if using_sonic_moe: - aoa_config["aoa_statements"] += [ - f"{prefix}.mlp.experts.$EXPERT_ID.gate_proj.weight, {prefix}.mlp.experts.$EXPERT_ID.up_proj.weight -> {prefix_offset}.mlp.experts.$EXPERT_ID.up_gate_proj.weight, axis=0", - ] - else: - aoa_config["aoa_statements"] += [ - f"{prefix}.mlp.experts.$EXPERT_ID.gate_proj.weight^T, {prefix}.mlp.experts.$EXPERT_ID.up_proj.weight^T -> {prefix_offset}.mlp.experts.$EXPERT_ID.up_gate_proj.weight, axis=1", - ] - + aoa_config["aoa_statements"] += [ + f"{prefix}.mlp.shared_experts.gate_proj.weight^T, {prefix}.mlp.shared_experts.up_proj.weight^T -> {prefix_offset}.mlp.shared_experts.up_gate_proj.weight, fused_ffn", + ] + if is_fleet: + if using_sonic_moe: + aoa_config["aoa_statements"] += [ + f"{prefix}.mlp.experts.$EXPERT_ID.gate_proj.weight, {prefix}.mlp.experts.$EXPERT_ID.up_proj.weight -> {prefix_offset}.mlp.experts.$EXPERT_ID.up_gate_proj.weight, axis=0", + ] else: aoa_config["aoa_statements"] += [ - f"{prefix}.mlp.experts.$EXPERT_ID.gate_proj.weight^T, {prefix}.mlp.experts.$EXPERT_ID.up_proj.weight^T -> {prefix_offset}.mlp.experts.$EXPERT_ID.up_gate_proj.weight, fused_ffn", + f"{prefix}.mlp.experts.$EXPERT_ID.gate_proj.weight^T, {prefix}.mlp.experts.$EXPERT_ID.up_proj.weight^T -> {prefix_offset}.mlp.experts.$EXPERT_ID.up_gate_proj.weight, axis=1", ] + else: + aoa_config["aoa_statements"] += [ + f"{prefix}.mlp.experts.$EXPERT_ID.gate_proj.weight^T, {prefix}.mlp.experts.$EXPERT_ID.up_proj.weight^T -> {prefix_offset}.mlp.experts.$EXPERT_ID.up_gate_proj.weight, fused_ffn", + ] + if is_fleet and (config.moe_grouped_gemm or using_sonic_moe): ep_weight1 = [] ep_weight2 = [] @@ -1000,17 +938,11 @@ def _gen_inv_aoa_config(cls, config: Glm4MoeConfig): aoa_statements += [ f"{model_prefix}layers.{num_head_empty_layers}.mlp.down_proj.weight^T -> model.layers.0.mlp.down_proj.weight", ] - if not config.fuse_attention_ffn: - aoa_statements += [ - f"{model_prefix}layers.{num_head_empty_layers}.mlp.gate_proj.weight^T -> model.layers.0.mlp.gate_proj.weight", - 
f"{model_prefix}layers.{num_head_empty_layers}.mlp.up_proj.weight^T -> model.layers.0.mlp.up_proj.weight", - ] - else: - aoa_statements += [ - f"{model_prefix}layers.{num_head_empty_layers}.mlp.up_gate_proj.weight -> model.layers.{num_head_empty_layers}.mlp.gate_proj.weight, model.layers.{num_head_empty_layers}.mlp.up_proj.weight, fused_ffn", - f"model.layers.{num_head_empty_layers}.mlp.gate_proj.weight^T -> model.layers.0.mlp.gate_proj.weight", - f"model.layers.{num_head_empty_layers}.mlp.up_proj.weight^T -> model.layers.0.mlp.up_proj.weight", - ] + aoa_statements += [ + f"{model_prefix}layers.{num_head_empty_layers}.mlp.up_gate_proj.weight -> model.layers.{num_head_empty_layers}.mlp.gate_proj.weight, model.layers.{num_head_empty_layers}.mlp.up_proj.weight, fused_ffn", + f"model.layers.{num_head_empty_layers}.mlp.gate_proj.weight^T -> model.layers.0.mlp.gate_proj.weight", + f"model.layers.{num_head_empty_layers}.mlp.up_proj.weight^T -> model.layers.0.mlp.up_proj.weight", + ] # layer 0 -> layer num_hidden_layers-1 for layer_idx in range(0, num_hidden_layers): @@ -1023,23 +955,16 @@ def _gen_inv_aoa_config(cls, config: Glm4MoeConfig): f"{prefix_offset}.post_attention_layernorm.weight -> {prefix}.post_attention_layernorm.weight", f"{prefix_offset}.self_attn.o_proj.weight^T -> {prefix}.self_attn.o_proj.weight", ] - if not config.fuse_attention_qkv: - aoa_statements += [ - f"{prefix_offset}.self_attn.{x}_proj.weight^T -> {prefix}.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - else: - aoa_statements += [ - f"{prefix_offset}.self_attn.qkv_proj.weight -> {prefix}.self_attn.q_proj.weight, {prefix}.self_attn.k_proj.weight, {prefix}.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", - ] + aoa_statements += [ + f"{prefix_offset}.self_attn.qkv_proj.weight -> {prefix}.self_attn.q_proj.weight, {prefix}.self_attn.k_proj.weight, {prefix}.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", + ] + aoa_statements += [ + f"{prefix}.self_attn.{x}_proj.weight^T -> {prefix}.self_attn.{x}_proj.weight" for x in ("q", "k", "v") + ] + if config.attention_bias: aoa_statements += [ - f"{prefix}.self_attn.{x}_proj.weight^T -> {prefix}.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") + f"{prefix_offset}.self_attn.qkv_proj.bias -> {prefix}.self_attn.q_proj.bias, {prefix}.self_attn.k_proj.bias, {prefix}.self_attn.v_proj.bias , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}, axis = 0", ] - if config.attention_bias: - aoa_statements += [ - f"{prefix_offset}.self_attn.qkv_proj.bias -> {prefix}.self_attn.q_proj.bias, {prefix}.self_attn.k_proj.bias, {prefix}.self_attn.v_proj.bias , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}, axis = 0", - ] # layer 1 -> layer num_hidden_layers-1 for layer_idx in range(1, num_hidden_layers): @@ -1068,57 +993,42 @@ def _gen_inv_aoa_config(cls, config: Glm4MoeConfig): f"{prefix_offset}.mlp.shared_experts.down_proj.weight^T -> {prefix}.mlp.shared_experts.down_proj.weight", ] - if not config.fuse_attention_ffn: + aoa_statements += [ + f"{prefix_offset}.mlp.shared_experts.up_gate_proj.weight -> {prefix_offset}.mlp.shared_experts.gate_proj.weight, {prefix_offset}.mlp.shared_experts.up_proj.weight, fused_ffn", + f"{prefix_offset}.mlp.shared_experts.gate_proj.weight^T -> {prefix}.mlp.shared_experts.gate_proj.weight", 
+ f"{prefix_offset}.mlp.shared_experts.up_proj.weight^T -> {prefix}.mlp.shared_experts.up_proj.weight", + ] + if is_fleet: + if using_sonic_moe: + aoa_statements += [ + f"{prefix_offset}.mlp.experts.{expert_id}.up_gate_proj.weight -> {prefix_offset}.mlp.experts.{expert_id}.gate_proj.weight, {prefix_offset}.mlp.experts.{expert_id}.up_proj.weight, axis=0" + for expert_id in range(config.n_routed_experts) + ] + else: + aoa_statements += [ + f"{prefix_offset}.mlp.experts.{expert_id}.up_gate_proj.weight -> {prefix_offset}.mlp.experts.{expert_id}.gate_proj.weight, {prefix_offset}.mlp.experts.{expert_id}.up_proj.weight, axis=1" + for expert_id in range(config.n_routed_experts) + ] + else: + aoa_statements += [ + f"{prefix_offset}.mlp.experts.{expert_id}.up_gate_proj.weight -> {prefix_offset}.mlp.experts.{expert_id}.gate_proj.weight, {prefix_offset}.mlp.experts.{expert_id}.up_proj.weight, fused_ffn" + for expert_id in range(config.n_routed_experts) + ] + if not using_sonic_moe: aoa_statements += ( [ - f"{prefix_offset}.mlp.shared_experts.{y}_proj.weight^T -> {prefix}.mlp.shared_experts.{y}_proj.weight" - for y in ("gate", "up") + f"{prefix_offset}.mlp.experts.{expert_id}.down_proj.weight^T -> {prefix}.mlp.experts.{expert_id}.down_proj.weight" + for expert_id in range(config.n_routed_experts) ] + [ - f"{prefix_offset}.mlp.experts.$EXPERT_ID.{y}_proj.weight^T -> {prefix}.mlp.experts.$EXPERT_ID.{y}_proj.weight" - for y in ("gate", "up") + f"{prefix_offset}.mlp.experts.{expert_id}.gate_proj.weight^T -> {prefix}.mlp.experts.{expert_id}.gate_proj.weight" + for expert_id in range(config.n_routed_experts) ] + [ - f"{prefix_offset}.mlp.experts.$EXPERT_ID.down_proj.weight^T -> {prefix}.mlp.experts.$EXPERT_ID.down_proj.weight" - ] - ) - else: - aoa_statements += [ - f"{prefix_offset}.mlp.shared_experts.up_gate_proj.weight -> {prefix_offset}.mlp.shared_experts.gate_proj.weight, {prefix_offset}.mlp.shared_experts.up_proj.weight, fused_ffn", - f"{prefix_offset}.mlp.shared_experts.gate_proj.weight^T -> {prefix}.mlp.shared_experts.gate_proj.weight", - f"{prefix_offset}.mlp.shared_experts.up_proj.weight^T -> {prefix}.mlp.shared_experts.up_proj.weight", - ] - if is_fleet: - if using_sonic_moe: - aoa_statements += [ - f"{prefix_offset}.mlp.experts.{expert_id}.up_gate_proj.weight -> {prefix_offset}.mlp.experts.{expert_id}.gate_proj.weight, {prefix_offset}.mlp.experts.{expert_id}.up_proj.weight, axis=0" - for expert_id in range(config.n_routed_experts) - ] - else: - aoa_statements += [ - f"{prefix_offset}.mlp.experts.{expert_id}.up_gate_proj.weight -> {prefix_offset}.mlp.experts.{expert_id}.gate_proj.weight, {prefix_offset}.mlp.experts.{expert_id}.up_proj.weight, axis=1" - for expert_id in range(config.n_routed_experts) - ] - else: - aoa_statements += [ - f"{prefix_offset}.mlp.experts.{expert_id}.up_gate_proj.weight -> {prefix_offset}.mlp.experts.{expert_id}.gate_proj.weight, {prefix_offset}.mlp.experts.{expert_id}.up_proj.weight, fused_ffn" + f"{prefix_offset}.mlp.experts.{expert_id}.up_proj.weight^T -> {prefix}.mlp.experts.{expert_id}.up_proj.weight" for expert_id in range(config.n_routed_experts) ] - if not using_sonic_moe: - aoa_statements += ( - [ - f"{prefix_offset}.mlp.experts.{expert_id}.down_proj.weight^T -> {prefix}.mlp.experts.{expert_id}.down_proj.weight" - for expert_id in range(config.n_routed_experts) - ] - + [ - f"{prefix_offset}.mlp.experts.{expert_id}.gate_proj.weight^T -> {prefix}.mlp.experts.{expert_id}.gate_proj.weight" - for expert_id in range(config.n_routed_experts) - ] - + [ - 
f"{prefix_offset}.mlp.experts.{expert_id}.up_proj.weight^T -> {prefix}.mlp.experts.{expert_id}.up_proj.weight" - for expert_id in range(config.n_routed_experts) - ] - ) + ) aoa_config = {"aoa_statements": aoa_statements} return aoa_config diff --git a/paddleformers/transformers/gpt_oss/modeling.py b/paddleformers/transformers/gpt_oss/modeling.py index 088c08603fb..e17e71317ab 100644 --- a/paddleformers/transformers/gpt_oss/modeling.py +++ b/paddleformers/transformers/gpt_oss/modeling.py @@ -590,20 +590,14 @@ def _gen_aoa_config(cls, config: GptOssConfig): } # attention qkv - if not config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", - f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - ] + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" + for x in ("q", "k", "v") + ] + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.self_attn.{x}_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias" + for x in ("q", "k", "v") + ] return aoa_config @@ -628,25 +622,14 @@ def _gen_inv_aoa_config(cls, config: GptOssConfig): f"{model_prefix}layers.$LAYER_ID.mlp.experts.down_proj_bias -> model.layers.$LAYER_ID.mlp.experts.down_proj_bias", ] - if not config.fuse_attention_qkv: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> model.layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias -> model.layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}, axis = 0", - ] - aoa_statements += [ - f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" - for layer_id in range(config.num_hidden_layers) - for x in ("q", "k", "v") - ] + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> 
model.layers.$LAYER_ID.self_attn.{x}_proj.weight" + for x in ("q", "k", "v") + ] + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias -> model.layers.$LAYER_ID.self_attn.{x}_proj.bias" + for x in ("q", "k", "v") + ] aoa_config = {"aoa_statements": aoa_statements} return aoa_config diff --git a/paddleformers/transformers/masking_utils.py b/paddleformers/transformers/masking_utils.py index 4248e271a34..89a5988dd2b 100644 --- a/paddleformers/transformers/masking_utils.py +++ b/paddleformers/transformers/masking_utils.py @@ -132,8 +132,8 @@ def create_causal_masks_and_row_indices( # Enables the efficient built-in causal mode (is_causal=True) # for FA backends (sdpa/flashmask), bypassing manual mask generation. # for third-party attention registered via _attn_implementation, default to bypass mask generation. - attn_impl = getattr(config, "_attn_implementation", "eager") - is_flash_backend = attn_impl != "eager" + _attn_implementation = getattr(config, "_attn_implementation", "eager") + is_flash_backend = _attn_implementation != "eager" is_fully_attended = attention_mask is None or (attention_mask is not None and attention_mask.cast("bool").all()) if is_flash_backend and is_fully_attended: if return_mapping: @@ -241,8 +241,8 @@ def create_causal_mask_and_row_indices( causal_mask = None row_indices = attn_mask_startend_row_indices else: - attn_impl = getattr(config, "_attn_implementation", "eager") - is_flash_backend = attn_impl != "eager" + _attn_implementation = getattr(config, "_attn_implementation", "eager") + is_flash_backend = _attn_implementation != "eager" # Check if the mask can be safely skipped # Condition: Must be Flash Backend AND No extra mask func AND No padding (mask is None or all True) diff --git a/paddleformers/transformers/paddleocr_vl/configuration.py b/paddleformers/transformers/paddleocr_vl/configuration.py index 619792cfaf2..8d5f9ff7d64 100644 --- a/paddleformers/transformers/paddleocr_vl/configuration.py +++ b/paddleformers/transformers/paddleocr_vl/configuration.py @@ -94,7 +94,6 @@ def __init__( use_cache=False, use_sparse_flash_attn=False, _attn_implementation="eager", - fuse_rms_norm=False, pad_token_id=0, bos_token_id=1, eos_token_id=2, @@ -141,7 +140,6 @@ def __init__( self.use_cache = use_cache self.use_sparse_flash_attn = use_sparse_flash_attn self._attn_implementation = _attn_implementation - self.fuse_rms_norm = fuse_rms_norm self.pad_token_id = pad_token_id self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id diff --git a/paddleformers/transformers/qwen2/modeling.py b/paddleformers/transformers/qwen2/modeling.py index 2c8fa89b23c..c9ec81fd2cc 100644 --- a/paddleformers/transformers/qwen2/modeling.py +++ b/paddleformers/transformers/qwen2/modeling.py @@ -91,7 +91,6 @@ def __init__(self, config: Qwen2Config, layer_idx: int = 0): assert config.num_attention_heads // config.num_key_value_heads self.sequence_parallel = config.sequence_parallel - self.fuse_attention_qkv = config.fuse_attention_qkv self.gqa_or_mqa = config.num_attention_heads != config.num_key_value_heads if config.tensor_model_parallel_size > 1: @@ -108,36 +107,13 @@ def __init__(self, config: Qwen2Config, layer_idx: int = 0): kv_hidden_size = self.config.num_key_value_heads * self.head_dim q_hidden_size = self.config.num_attention_heads * self.head_dim - if not self.fuse_attention_qkv: - self.q_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size, - has_bias=True, - config=config, - tp_plan="colwise", - ) - self.k_proj = GeneralLinear.create( - 
config.hidden_size, - kv_hidden_size, - has_bias=True, - config=config, - tp_plan="colwise", - ) - self.v_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=True, - config=config, - tp_plan="colwise", - ) - else: - self.qkv_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size + 2 * kv_hidden_size, - has_bias=True, - config=config, - tp_plan="colwise", - ) + self.qkv_proj = GeneralLinear.create( + config.hidden_size, + q_hidden_size + 2 * kv_hidden_size, + has_bias=True, + config=config, + tp_plan="colwise", + ) self.o_proj = GeneralLinear.create( q_hidden_size, @@ -160,43 +136,27 @@ def forward( **kwargs, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: """Input shape: Batch x Time x Channel""" - if not self.fuse_attention_qkv: - # [bs, seq_len, num_head * head_dim] -> [seq_len / n, bs, num_head * head_dim] (n is model parallelism) - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - else: - bsz, q_len, _ = hidden_states.shape - query_states = query_states.reshape([bsz, q_len, -1, self.head_dim]) - key_states = key_states.reshape([bsz, q_len, -1, self.head_dim]) - value_states = value_states.reshape([bsz, q_len, -1, self.head_dim]) + mix_layer = self.qkv_proj(hidden_states) + if self.sequence_parallel: + max_sequence_length = self.config.max_sequence_length + bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length + q_len = max_sequence_length + target_shape = [ + bsz, + q_len, + self.num_key_value_heads, + (self.num_key_value_groups + 2) * self.head_dim, + ] else: - mix_layer = self.qkv_proj(hidden_states) - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - target_shape = [ - bsz, - q_len, - self.num_key_value_heads, - (self.num_key_value_groups + 2) * self.head_dim, - ] - else: - target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] - mix_layer = paddle.reshape_(mix_layer, target_shape) - query_states, key_states, value_states = paddle.split( - mix_layer, - num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], - axis=-1, - ) - if self.gqa_or_mqa: - query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) + target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] + mix_layer = paddle.reshape_(mix_layer, target_shape) + query_states, key_states, value_states = paddle.split( + mix_layer, + num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], + axis=-1, + ) + if self.gqa_or_mqa: + query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) # [bs, seq_len, num_head, head_dim] -> [bs, num_head, seq_len, head_dim] query_states = query_states.transpose(1, 2) @@ -240,7 +200,7 @@ def __init__(self, config: Qwen2Config, layer_idx: int): self.self_attn = Qwen2Attention(config, layer_idx) - self.mlp = Qwen2MLP(config, fuse_up_gate=config.fuse_attention_ffn) + self.mlp = Qwen2MLP(config, fuse_up_gate=True) self.input_layernorm = 
GeneralNorm.create( config=config, norm_type="rms_norm", @@ -319,33 +279,17 @@ def _gen_aoa_config(cls, config: Qwen2Config): } # attention qkv - if not config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", - ] - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - ] + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", + ] + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", + ] # FFN - if not config.fuse_attention_ffn: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.{p}_proj.weight" - for p in ("gate", "up") - ] - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", - ] + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", + ] # lm_head if config.tie_word_embeddings: @@ -365,42 +309,26 @@ def _gen_inv_aoa_config(cls, config: Qwen2Config): f"{model_prefix}norm.weight -> model.norm.weight", ] - if not config.fuse_attention_qkv: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> model.layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias -> model.layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", - ] - for layer_id in range(config.num_hidden_layers): - for x in ("q", "k", 
"v"): - aoa_statements += [ - f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" - ] - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - ] + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", + ] + for layer_id in range(config.num_hidden_layers): + for x in ("q", "k", "v"): + aoa_statements += [ + f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" + ] + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", + ] - if not config.fuse_attention_ffn: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.mlp.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.{y}_proj.weight" - for y in ("gate", "up") - ] - else: + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.gate_proj.weight, model.layers.$LAYER_ID.mlp.up_proj.weight, fused_ffn", + ] + for layer_id in range(config.num_hidden_layers): aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.gate_proj.weight, model.layers.$LAYER_ID.mlp.up_proj.weight, fused_ffn", + f"model.layers.{layer_id}.mlp.gate_proj.weight^T -> model.layers.{layer_id}.mlp.gate_proj.weight", + f"model.layers.{layer_id}.mlp.up_proj.weight^T -> model.layers.{layer_id}.mlp.up_proj.weight", ] - for layer_id in range(config.num_hidden_layers): - aoa_statements += [ - f"model.layers.{layer_id}.mlp.gate_proj.weight^T -> model.layers.{layer_id}.mlp.gate_proj.weight", - f"model.layers.{layer_id}.mlp.up_proj.weight^T -> model.layers.{layer_id}.mlp.up_proj.weight", - ] if config.tie_word_embeddings: aoa_statements += ["lm_head.weight -> _"] diff --git a/paddleformers/transformers/qwen2_5_vl/modeling.py b/paddleformers/transformers/qwen2_5_vl/modeling.py index e16afab24f2..9d9815d5b16 100644 --- a/paddleformers/transformers/qwen2_5_vl/modeling.py +++ b/paddleformers/transformers/qwen2_5_vl/modeling.py @@ -307,31 +307,15 @@ def _gen_aoa_config(cls, config: Qwen2_5_VLConfig): ] + [f"visual.merger.mlp.{x}.bias -> {visual_prefix}merger.mlp.{x}.bias" for x in ("0", "2")] # attention qkv - if not config.text_config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.bias -> {llm_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, 
model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups={config.text_config.num_key_value_heads}", - f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups={config.text_config.num_key_value_heads}, axis=0", - ] + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups={config.text_config.num_key_value_heads}", + f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups={config.text_config.num_key_value_heads}, axis=0", + ] # FFN - if not config.text_config.fuse_attention_ffn: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.{p}_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.mlp.{p}_proj.weight" - for p in ("gate", "up") - ] - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", - ] + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", + ] # Qwen2_5_VLModel without lm_head if cls.base_model_prefix: @@ -391,41 +375,25 @@ def _gen_inv_aoa_config(cls, config: Qwen2_5_VLConfig): ] + [f"{visual_prefix}merger.mlp.{x}.bias -> visual.merger.mlp.{x}.bias" for x in ("0", "2")] # attention qkv - if not config.text_config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> model.layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias -> model.layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: - aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups = {config.text_config.num_key_value_heads}", - f"{llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups = {config.text_config.num_key_value_heads}, axis=0", - ] - aoa_config["aoa_statements"] += [ - f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" - for layer_id in range(config.text_config.num_hidden_layers) - for x in ("q", "k", "v") - ] + 
aoa_config["aoa_statements"] += [ + f"{llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> {llm_prefix}layers.$LAYER_ID.self_attn.q_proj.weight, {llm_prefix}layers.$LAYER_ID.self_attn.k_proj.weight, {llm_prefix}layers.$LAYER_ID.self_attn.v_proj.weight, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups = {config.text_config.num_key_value_heads}", + f"{llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> {llm_prefix}layers.$LAYER_ID.self_attn.q_proj.bias, {llm_prefix}layers.$LAYER_ID.self_attn.k_proj.bias, {llm_prefix}layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups = {config.text_config.num_key_value_heads}, axis=0", + ] + aoa_config["aoa_statements"] += [ + f"{llm_prefix}layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" + for layer_id in range(config.text_config.num_hidden_layers) + for x in ("q", "k", "v") + ] # FFN - if not config.text_config.fuse_attention_ffn: - aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.$LAYER_ID.mlp.{p}_proj.weight^T -> model.layers.$LAYER_ID.mlp.{p}_proj.weight" - for p in ("gate", "up") - ] - else: - aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.gate_proj.weight, model.layers.$LAYER_ID.mlp.up_proj.weight, fused_ffn" - ] - aoa_config["aoa_statements"] += [ - f"model.layers.{layer_id}.mlp.{x}_proj.weight^T -> model.layers.{layer_id}.mlp.{x}_proj.weight" - for layer_id in range(config.text_config.num_hidden_layers) - for x in ("gate", "up") - ] + aoa_config["aoa_statements"] += [ + f"{llm_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight -> {llm_prefix}layers.$LAYER_ID.mlp.gate_proj.weight, {llm_prefix}layers.$LAYER_ID.mlp.up_proj.weight, fused_ffn" + ] + aoa_config["aoa_statements"] += [ + f"{llm_prefix}layers.{layer_id}.mlp.{x}_proj.weight^T -> model.layers.{layer_id}.mlp.{x}_proj.weight" + for layer_id in range(config.text_config.num_hidden_layers) + for x in ("gate", "up") + ] # Qwen2_5_VLModel without lm_head if cls.base_model_prefix: @@ -803,7 +771,6 @@ def __init__(self, config: Qwen2_5_VLTextConfig, layer_idx: Optional[int] = None ) self.sequence_parallel = config.sequence_parallel - self.fuse_attention_qkv = config.fuse_attention_qkv self.gqa_or_mqa = config.num_attention_heads != config.num_key_value_heads if config.tensor_model_parallel_size > 1: @@ -820,36 +787,13 @@ def __init__(self, config: Qwen2_5_VLTextConfig, layer_idx: Optional[int] = None kv_hidden_size = self.config.num_key_value_heads * self.head_dim q_hidden_size = self.config.num_attention_heads * self.head_dim - if not self.fuse_attention_qkv: - self.q_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size, - has_bias=True, - config=config, - tp_plan="colwise", - ) - self.k_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=True, - config=config, - tp_plan="colwise", - ) - self.v_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=True, - config=config, - tp_plan="colwise", - ) - else: - self.qkv_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size + 2 * kv_hidden_size, - has_bias=True, - config=config, - tp_plan="colwise", - ) + self.qkv_proj = GeneralLinear.create( + config.hidden_size, + q_hidden_size + 2 * kv_hidden_size, + has_bias=True, + config=config, + tp_plan="colwise", + ) self.o_proj = GeneralLinear.create( q_hidden_size, config.hidden_size, @@ -872,46 
+816,29 @@ def forward( attn_mask_startend_row_indices: Optional[paddle.Tensor] = None, **kwargs, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: - if not self.fuse_attention_qkv: - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - else: - bsz, q_len, _ = hidden_states.shape - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.reshape(bsz, q_len, -1, self.head_dim) - key_states = key_states.reshape(bsz, q_len, -1, self.head_dim) - value_states = value_states.reshape(bsz, q_len, -1, self.head_dim) - + mix_layer = self.qkv_proj(hidden_states) + if self.sequence_parallel: + max_sequence_length = self.config.max_sequence_length + bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length + q_len = max_sequence_length + target_shape = [ + bsz, + q_len, + self.num_key_value_heads, + (self.num_key_value_groups + 2) * self.head_dim, + ] else: - mix_layer = self.qkv_proj(hidden_states) - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - target_shape = [ - bsz, - q_len, - self.num_key_value_heads, - (self.num_key_value_groups + 2) * self.head_dim, - ] - else: - target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] - # mix_layer = mix_layer.reshape(target_shape) - mix_layer = paddle.reshape_(mix_layer, target_shape) - query_states, key_states, value_states = paddle.split( - mix_layer, - num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], - axis=-1, - ) - if self.gqa_or_mqa: - # query_states = query_states.reshape([0, 0, self.num_heads, self.head_dim]) - query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) + target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] + # mix_layer = mix_layer.reshape(target_shape) + mix_layer = paddle.reshape_(mix_layer, target_shape) + query_states, key_states, value_states = paddle.split( + mix_layer, + num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], + axis=-1, + ) + if self.gqa_or_mqa: + # query_states = query_states.reshape([0, 0, self.num_heads, self.head_dim]) + query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) query_states = query_states.transpose(1, 2) key_states = key_states.transpose(1, 2) @@ -960,7 +887,7 @@ def __init__(self, config: Qwen2_5_VLTextConfig, layer_idx: int): ) self.self_attn = Qwen2_5_VLAttention(config, layer_idx) - self.mlp = Qwen2MLP(config, fuse_up_gate=config.fuse_attention_ffn) + self.mlp = Qwen2MLP(config, fuse_up_gate=True) self.input_layernorm = GeneralNorm.create( config=config, norm_type="rms_norm", diff --git a/paddleformers/transformers/qwen2_moe/modeling.py b/paddleformers/transformers/qwen2_moe/modeling.py index 8d47592755a..dcca6f0ec27 100644 --- a/paddleformers/transformers/qwen2_moe/modeling.py +++ b/paddleformers/transformers/qwen2_moe/modeling.py @@ -81,7 +81,6 @@ def __init__(self, config: Qwen2MoeConfig, layer_idx: int = 0): assert config.num_attention_heads // config.num_key_value_heads 
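# Illustrative sketch (not part of the patch): the shape bookkeeping behind the
# fused qkv_proj path that these hunks make the default. NumPy stands in for
# paddle here, and all sizes are made-up examples.
import numpy as np

bsz, q_len, head_dim = 2, 16, 64
num_heads, num_key_value_heads = 8, 2
num_key_value_groups = num_heads // num_key_value_heads

# qkv_proj packs q, k and v per KV head: (groups + 2) slices of head_dim each
mix_layer = np.zeros((bsz, q_len, num_key_value_heads * (num_key_value_groups + 2) * head_dim))
mix_layer = mix_layer.reshape(bsz, q_len, num_key_value_heads, (num_key_value_groups + 2) * head_dim)
q, k, v = np.split(
    mix_layer,
    [num_key_value_groups * head_dim, (num_key_value_groups + 1) * head_dim],
    axis=-1,
)
q = q.reshape(bsz, q_len, num_heads, head_dim)  # GQA: regroup the stacked query slices
assert k.shape == v.shape == (bsz, q_len, num_key_value_heads, head_dim)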
self.sequence_parallel = config.sequence_parallel - self.fuse_attention_qkv = config.fuse_attention_qkv self.gqa_or_mqa = config.num_attention_heads != config.num_key_value_heads if config.tensor_model_parallel_size > 1: @@ -98,36 +97,13 @@ def __init__(self, config: Qwen2MoeConfig, layer_idx: int = 0): kv_hidden_size = self.config.num_key_value_heads * self.head_dim q_hidden_size = self.config.num_attention_heads * self.head_dim - if not self.fuse_attention_qkv: - self.q_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size, - has_bias=config.qkv_bias, - config=config, - tp_plan="colwise", - ) - self.k_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=config.qkv_bias, - config=config, - tp_plan="colwise", - ) - self.v_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=config.qkv_bias, - config=config, - tp_plan="colwise", - ) - else: - self.qkv_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size + 2 * kv_hidden_size, - has_bias=config.qkv_bias, - config=config, - tp_plan="colwise", - ) + self.qkv_proj = GeneralLinear.create( + config.hidden_size, + q_hidden_size + 2 * kv_hidden_size, + has_bias=config.qkv_bias, + config=config, + tp_plan="colwise", + ) self.o_proj = GeneralLinear.create( q_hidden_size, @@ -149,43 +125,27 @@ def forward( **kwargs, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: """Input shape: Batch x Time x Channel""" - if not self.fuse_attention_qkv: - # [bs, seq_len, num_head * head_dim] -> [seq_len / n, bs, num_head * head_dim] (n is model parallelism) - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - else: - bsz, q_len, _ = hidden_states.shape - query_states = query_states.reshape([bsz, q_len, -1, self.head_dim]) - key_states = key_states.reshape([bsz, q_len, -1, self.head_dim]) - value_states = value_states.reshape([bsz, q_len, -1, self.head_dim]) + mix_layer = self.qkv_proj(hidden_states) + if self.sequence_parallel: + max_sequence_length = self.config.max_sequence_length + bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length + q_len = max_sequence_length + target_shape = [ + bsz, + q_len, + self.num_key_value_heads, + (self.num_key_value_groups + 2) * self.head_dim, + ] else: - mix_layer = self.qkv_proj(hidden_states) - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - target_shape = [ - bsz, - q_len, - self.num_key_value_heads, - (self.num_key_value_groups + 2) * self.head_dim, - ] - else: - target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] - mix_layer = paddle.reshape_(mix_layer, target_shape) - query_states, key_states, value_states = paddle.split( - mix_layer, - num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], - axis=-1, - ) - if self.gqa_or_mqa: - query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) + target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] + mix_layer = paddle.reshape_(mix_layer, 
target_shape) + query_states, key_states, value_states = paddle.split( + mix_layer, + num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], + axis=-1, + ) + if self.gqa_or_mqa: + query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) # [bs, seq_len, num_head, head_dim] -> [bs, num_head, seq_len, head_dim] query_states = query_states.transpose(1, 2) @@ -273,15 +233,13 @@ def __init__(self, config): ) self.experts = nn.LayerList( [ - Qwen2MoeMLP( - config, intermediate_size=config.moe_intermediate_size, fuse_up_gate=config.fuse_attention_ffn - ) + Qwen2MoeMLP(config, intermediate_size=config.moe_intermediate_size, fuse_up_gate=True) for _ in range(self.num_experts) ] ) self.shared_expert = Qwen2MoeMLP( - config, intermediate_size=config.shared_expert_intermediate_size, fuse_up_gate=config.fuse_attention_ffn + config, intermediate_size=config.shared_expert_intermediate_size, fuse_up_gate=True ) self.shared_expert_gate = GeneralLinear.create(config.hidden_size, 1, has_bias=False, linear_type="default") @@ -359,7 +317,7 @@ def __init__(self, config: Qwen2MoeConfig, layer_idx: int): self.mlp = Qwen2MoeSparseMoeBlock(config) else: # num_experts == 0 or this layer is not sparse layer - self.mlp = Qwen2MoeMLP(config, fuse_up_gate=config.fuse_attention_ffn) + self.mlp = Qwen2MoeMLP(config, fuse_up_gate=True) self.input_layernorm = GeneralNorm.create( config=config, @@ -528,39 +486,19 @@ def _gen_aoa_config(cls, config: Qwen2MoeConfig): } # attention qkv - if not config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - if config.qkv_bias: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", + ] + if config.qkv_bias: aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", + f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", ] - if config.qkv_bias: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - ] # FFN - if not config.fuse_attention_ffn: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.shared_expert.{p}_proj.weight^T -> 
{model_prefix}layers.$LAYER_ID.mlp.shared_expert.{p}_proj.weight" - for p in ("gate", "up") - ] + [ - f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{p}_proj.weight" - for p in ("gate", "up") - ] - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.shared_expert.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.shared_expert.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.shared_expert.up_gate_proj.weight, fused_ffn", - f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight, fused_ffn", - ] + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.mlp.shared_expert.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.shared_expert.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.shared_expert.up_gate_proj.weight, fused_ffn", + f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight, fused_ffn", + ] # lm_head if config.tie_word_embeddings: @@ -583,53 +521,33 @@ def _gen_inv_aoa_config(cls, config: Qwen2MoeConfig): f"{model_prefix}layers.$LAYER_ID.mlp.shared_expert_gate.weight^T -> model.layers.$LAYER_ID.mlp.shared_expert_gate.weight, dtype='bfloat16'", ] - if not config.fuse_attention_qkv: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> model.layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - if config.qkv_bias: + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", + ] + for layer_id in range(config.num_hidden_layers): + for x in ("q", "k", "v"): aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias -> model.layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") + f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" ] - else: + if config.qkv_bias: aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", + f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", ] - for layer_id in range(config.num_hidden_layers): - for x in ("q", "k", "v"): - aoa_statements += [ - f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" - ] - if config.qkv_bias: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, 
num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - ] - if not config.fuse_attention_ffn: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.mlp.shared_expert.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.shared_expert.{y}_proj.weight" - for y in ("gate", "up") - ] + [ - f"{model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{y}_proj.weight" - for y in ("gate", "up") - ] - else: + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.mlp.shared_expert.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.shared_expert.gate_proj.weight, model.layers.$LAYER_ID.mlp.shared_expert.up_proj.weight, fused_ffn", + f"{model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight, fused_ffn", + ] + for layer_id in range(config.num_hidden_layers): aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.mlp.shared_expert.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.shared_expert.gate_proj.weight, model.layers.$LAYER_ID.mlp.shared_expert.up_proj.weight, fused_ffn", - f"{model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight, fused_ffn", + f"model.layers.{layer_id}.mlp.shared_expert.gate_proj.weight^T -> model.layers.{layer_id}.mlp.shared_expert.gate_proj.weight", + f"model.layers.{layer_id}.mlp.shared_expert.up_proj.weight^T -> model.layers.{layer_id}.mlp.shared_expert.up_proj.weight", ] - for layer_id in range(config.num_hidden_layers): + for expert_id in range(config.num_experts): aoa_statements += [ - f"model.layers.{layer_id}.mlp.shared_expert.gate_proj.weight^T -> model.layers.{layer_id}.mlp.shared_expert.gate_proj.weight", - f"model.layers.{layer_id}.mlp.shared_expert.up_proj.weight^T -> model.layers.{layer_id}.mlp.shared_expert.up_proj.weight", + f"model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight", + f"model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight", ] - for expert_id in range(config.num_experts): - aoa_statements += [ - f"model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight", - f"model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight", - ] if config.tie_word_embeddings: aoa_statements += ["lm_head.weight -> _"] diff --git a/paddleformers/transformers/qwen3/modeling.py b/paddleformers/transformers/qwen3/modeling.py index 99506b75679..7e2283343cc 100644 --- a/paddleformers/transformers/qwen3/modeling.py +++ b/paddleformers/transformers/qwen3/modeling.py @@ -92,7 +92,6 @@ def __init__(self, config: Qwen3Config, layer_idx: int = 0): self.tensor_parallel = config.tensor_model_parallel_size > 1 self.sequence_parallel = config.sequence_parallel - self.fuse_attention_qkv = config.fuse_attention_qkv self.gqa_or_mqa = config.num_attention_heads != config.num_key_value_heads if config.tensor_model_parallel_size > 1: @@ -109,36 +108,13 @@ def __init__(self, config: Qwen3Config, layer_idx: int = 0): kv_hidden_size = self.config.num_key_value_heads * self.head_dim q_hidden_size = 
self.config.num_attention_heads * self.head_dim - if not self.fuse_attention_qkv: - self.q_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - self.k_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - self.v_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - else: - self.qkv_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size + 2 * kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) + self.qkv_proj = GeneralLinear.create( + config.hidden_size, + q_hidden_size + 2 * kv_hidden_size, + has_bias=config.attention_bias, + config=config, + tp_plan="colwise", + ) self.o_proj = GeneralLinear.create( q_hidden_size, @@ -175,46 +151,29 @@ def forward( **kwargs, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: """Input shape: Batch x Time x Channel""" - if not self.fuse_attention_qkv: - # [bs, seq_len, num_head * head_dim] -> [seq_len / n, bs, num_head * head_dim] (n is model parallelism) - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - else: - bsz, q_len, _ = hidden_states.shape - # Add qk norm for Qwen3 model. - query_states = self.q_norm(query_states.reshape([bsz, q_len, -1, self.head_dim])) - key_states = self.k_norm(key_states.reshape([bsz, q_len, -1, self.head_dim])) - value_states = value_states.reshape([bsz, q_len, -1, self.head_dim]) + mix_layer = self.qkv_proj(hidden_states) + if self.sequence_parallel: + max_sequence_length = self.config.max_sequence_length + bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length + q_len = max_sequence_length + target_shape = [ + bsz, + q_len, + self.num_key_value_heads, + (self.num_key_value_groups + 2) * self.head_dim, + ] else: - mix_layer = self.qkv_proj(hidden_states) - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - target_shape = [ - bsz, - q_len, - self.num_key_value_heads, - (self.num_key_value_groups + 2) * self.head_dim, - ] - else: - target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] - mix_layer = paddle.reshape_(mix_layer, target_shape) - query_states, key_states, value_states = paddle.split( - mix_layer, - num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], - axis=-1, - ) - if self.gqa_or_mqa: - query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) - query_states = self.q_norm(query_states) - key_states = self.k_norm(key_states) + target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] + mix_layer = paddle.reshape_(mix_layer, target_shape) + query_states, key_states, value_states = paddle.split( + mix_layer, + num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], + axis=-1, + ) + if self.gqa_or_mqa: + 
query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) + query_states = self.q_norm(query_states) + key_states = self.k_norm(key_states) # [bs, seq_len, num_head, head_dim] -> [bs, num_head, seq_len, head_dim] query_states = query_states.transpose(1, 2) @@ -257,7 +216,7 @@ def __init__(self, config: Qwen3Config, layer_idx: int): self.self_attn = Qwen3Attention(config, layer_idx) - self.mlp = Qwen3MLP(config, fuse_up_gate=config.fuse_attention_ffn) + self.mlp = Qwen3MLP(config, fuse_up_gate=True) self.input_layernorm = GeneralNorm.create( config=config, norm_type="rms_norm", @@ -337,35 +296,18 @@ def _gen_aoa_config(cls, config: Qwen3Config): } # attention qkv - if not config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - if config.attention_bias: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", + ] + if config.attention_bias: aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", + f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", ] - if config.attention_bias: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - ] # FFN - if not config.fuse_attention_ffn: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.{p}_proj.weight" - for p in ("gate", "up") - ] - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", - ] + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", + ] # lm_head if config.tie_word_embeddings: @@ -387,48 +329,30 @@ def _gen_inv_aoa_config(cls, config: Qwen3Config): f"{model_prefix}layers.$LAYER_ID.self_attn.k_norm.weight -> model.layers.$LAYER_ID.self_attn.k_norm.weight", ] - if not config.fuse_attention_qkv: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> 
model.layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - if config.attention_bias: + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", + ] + for layer_id in range(config.num_hidden_layers): + for x in ("q", "k", "v"): aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias -> model.layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") + f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" ] - else: + if config.attention_bias: aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", + f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", ] - for layer_id in range(config.num_hidden_layers): - for x in ("q", "k", "v"): - aoa_statements += [ - f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" - ] - if config.attention_bias: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", - ] - if not config.fuse_attention_ffn: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.mlp.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.{y}_proj.weight" - for y in ("gate", "up") - ] - else: + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.gate_proj.weight, model.layers.$LAYER_ID.mlp.up_proj.weight, fused_ffn", + ] + for layer_id in range(config.num_hidden_layers): aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.gate_proj.weight, model.layers.$LAYER_ID.mlp.up_proj.weight, fused_ffn", + f"model.layers.{layer_id}.mlp.gate_proj.weight^T -> model.layers.{layer_id}.mlp.gate_proj.weight", + f"model.layers.{layer_id}.mlp.up_proj.weight^T -> model.layers.{layer_id}.mlp.up_proj.weight", ] - for layer_id in range(config.num_hidden_layers): - aoa_statements += [ - f"model.layers.{layer_id}.mlp.gate_proj.weight^T -> model.layers.{layer_id}.mlp.gate_proj.weight", - f"model.layers.{layer_id}.mlp.up_proj.weight^T -> model.layers.{layer_id}.mlp.up_proj.weight", - ] if config.tie_word_embeddings: aoa_statements += ["lm_head.weight -> _"] - aoa_config = {"aoa_statements": aoa_statements} return aoa_config diff --git a/paddleformers/transformers/qwen3_moe/modeling.py b/paddleformers/transformers/qwen3_moe/modeling.py index b1e3d2f2f8e..4d225049958 100644 --- a/paddleformers/transformers/qwen3_moe/modeling.py +++ 
b/paddleformers/transformers/qwen3_moe/modeling.py @@ -130,7 +130,6 @@ def __init__(self, config: Qwen3MoeConfig, layer_idx: int = 0): self.tensor_parallel = config.tensor_model_parallel_size > 1 self.sequence_parallel = config.sequence_parallel - self.fuse_attention_qkv = config.fuse_attention_qkv self.gqa_or_mqa = config.num_attention_heads != config.num_key_value_heads if config.tensor_model_parallel_size > 1: @@ -147,36 +146,13 @@ def __init__(self, config: Qwen3MoeConfig, layer_idx: int = 0): kv_hidden_size = self.config.num_key_value_heads * self.head_dim q_hidden_size = self.config.num_attention_heads * self.head_dim - if not self.fuse_attention_qkv: - self.q_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - self.k_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - self.v_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - else: - self.qkv_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size + 2 * kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) + self.qkv_proj = GeneralLinear.create( + config.hidden_size, + q_hidden_size + 2 * kv_hidden_size, + has_bias=config.attention_bias, + config=config, + tp_plan="colwise", + ) self.o_proj = GeneralLinear.create( q_hidden_size, @@ -212,46 +188,29 @@ def forward( **kwargs, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: """Input shape: Batch x Time x Channel""" - if not self.fuse_attention_qkv: - # [bs, seq_len, num_head * head_dim] -> [seq_len / n, bs, num_head * head_dim] (n is model parallelism) - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - else: - bsz, q_len, _ = hidden_states.shape - # Add qk norm for Qwen3MoE model. 
- query_states = self.q_norm(query_states.reshape([bsz, q_len, -1, self.head_dim])) - key_states = self.k_norm(key_states.reshape([bsz, q_len, -1, self.head_dim])) - value_states = value_states.reshape([bsz, q_len, -1, self.head_dim]) + mix_layer = self.qkv_proj(hidden_states) + if self.sequence_parallel: + max_sequence_length = self.config.max_sequence_length + bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length + q_len = max_sequence_length + target_shape = [ + bsz, + q_len, + self.num_key_value_heads, + (self.num_key_value_groups + 2) * self.head_dim, + ] else: - mix_layer = self.qkv_proj(hidden_states) - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - target_shape = [ - bsz, - q_len, - self.num_key_value_heads, - (self.num_key_value_groups + 2) * self.head_dim, - ] - else: - target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] - mix_layer = paddle.reshape_(mix_layer, target_shape) - query_states, key_states, value_states = paddle.split( - mix_layer, - num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], - axis=-1, - ) - if self.gqa_or_mqa: - query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) - query_states = self.q_norm(query_states) - key_states = self.k_norm(key_states) + target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] + mix_layer = paddle.reshape_(mix_layer, target_shape) + query_states, key_states, value_states = paddle.split( + mix_layer, + num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], + axis=-1, + ) + if self.gqa_or_mqa: + query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) + query_states = self.q_norm(query_states) + key_states = self.k_norm(key_states) # [bs, seq_len, num_head, head_dim] -> [bs, num_head, seq_len, head_dim] query_states = query_states.transpose(1, 2) @@ -339,9 +298,7 @@ def __init__(self, config): ) self.experts = nn.LayerList( [ - Qwen3MoeMLP( - config, intermediate_size=config.moe_intermediate_size, fuse_up_gate=config.fuse_attention_ffn - ) + Qwen3MoeMLP(config, intermediate_size=config.moe_intermediate_size, fuse_up_gate=True) for _ in range(self.num_experts) ] ) @@ -434,7 +391,7 @@ def __init__(self, config: Qwen3MoeConfig, layer_idx: int): ) else: # num_experts == 0 or this layer is not sparse layer - self.mlp = Qwen3MoeMLP(config, fuse_up_gate=config.fuse_attention_ffn) + self.mlp = Qwen3MoeMLP(config, fuse_up_gate=True) self.input_layernorm = GeneralNorm.create( config=config, @@ -787,46 +744,29 @@ def _gen_aoa_config(cls, config: Qwen3MoeConfig): ] # attention qkv - if not config.fuse_attention_qkv: + aoa_config["aoa_statements"] += [ + f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", + ] + if config.attention_bias: aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") + f"model.layers.$LAYER_ID.self_attn.q_proj.bias, 
model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", ] - if config.attention_bias: + + # FFN + if getattr(cls, "is_fleet", False): + if using_sonic_moe: aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") + f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight, axis=0", ] - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", - ] - if config.attention_bias: + else: aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", + f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight, axis=1", ] - # FFN - if not config.fuse_attention_ffn: + else: aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{p}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{p}_proj.weight" - for p in ("gate", "up") + f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight, fused_ffn", ] - else: - if getattr(cls, "is_fleet", False): - if using_sonic_moe: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight, axis=0", - ] - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight, axis=1", - ] - - else: - aoa_config["aoa_statements"] += [ - f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight, fused_ffn", - ] if getattr(cls, "is_fleet", False) and (config.moe_grouped_gemm or using_sonic_moe): for layer_idx in range(0, config.num_hidden_layers): @@ -881,77 +821,55 @@ def _gen_inv_aoa_config(cls, config: Qwen3MoeConfig): f"{model_prefix}layers.$LAYER_ID.self_attn.k_norm.weight -> model.layers.$LAYER_ID.self_attn.k_norm.weight", ] - if not config.fuse_attention_qkv: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> 
model.layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - if config.attention_bias: + aoa_statements += [ + f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", + ] + for layer_id in range(config.num_hidden_layers): + for x in ("q", "k", "v"): aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias -> model.layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") + f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" ] - else: + if config.attention_bias: aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}", + f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", ] + + if getattr(cls, "is_fleet", False) and (config.moe_grouped_gemm or using_sonic_moe): for layer_id in range(config.num_hidden_layers): - for x in ("q", "k", "v"): - aoa_statements += [ - f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight" - ] - if config.attention_bias: + ep_weight1 = [] + ep_weight2 = [] + for expert_id in range(num_experts): + ep_weight1.append(f"{model_prefix}layers.{layer_id}.mlp.experts.{expert_id}.up_gate_proj.weight") + ep_weight2.append(f"{model_prefix}layers.{layer_id}.mlp.experts.{expert_id}.down_proj.weight") + group_gemm1 = ",".join(ep_weight1) + group_gemm2 = ",".join(ep_weight2) aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", + f"{model_prefix}layers.{layer_id}.mlp.grouped_gemm_experts.weight1 -> {group_gemm1}, axis=0" + f"{model_prefix}layers.{layer_id}.mlp.grouped_gemm_experts.weight2 -> {group_gemm2}, axis=0" ] - if not config.fuse_attention_ffn: - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{y}_proj.weight" - for y in ("gate", "up") - ] - aoa_statements += [ - f"{model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.down_proj.weight^T -> model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.down_proj.weight", - ] - else: - if getattr(cls, "is_fleet", False) and (config.moe_grouped_gemm or using_sonic_moe): - for layer_id in range(config.num_hidden_layers): - ep_weight1 = [] - ep_weight2 = [] - for expert_id in range(num_experts): - ep_weight1.append( - f"{model_prefix}layers.{layer_id}.mlp.experts.{expert_id}.up_gate_proj.weight" - ) - ep_weight2.append(f"{model_prefix}layers.{layer_id}.mlp.experts.{expert_id}.down_proj.weight") - group_gemm1 = ",".join(ep_weight1) - group_gemm2 
= ",".join(ep_weight2) - aoa_statements += [ - f"{model_prefix}layers.{layer_id}.mlp.grouped_gemm_experts.weight1 -> {group_gemm1}, axis=0" - f"{model_prefix}layers.{layer_id}.mlp.grouped_gemm_experts.weight2 -> {group_gemm2}, axis=0" - ] - - for layer_id in range(config.num_hidden_layers): - for expert_id in range(num_experts): - if getattr(cls, "is_fleet", False): - if using_sonic_moe: - aoa_statements += [ - f"{model_prefix}layers.{layer_id}.mlp.experts.{expert_id}.up_gate_proj.weight -> model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight, model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight, axis=0", - ] - else: - aoa_statements += [ - f"{model_prefix}layers.{layer_id}.mlp.experts.{expert_id}.up_gate_proj.weight -> model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight, model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight, axis=1", - ] - else: + for layer_id in range(config.num_hidden_layers): + for expert_id in range(num_experts): + if getattr(cls, "is_fleet", False): + if using_sonic_moe: aoa_statements += [ - f"{model_prefix}layers.{layer_id}.mlp.experts.{expert_id}.up_gate_proj.weight -> model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight, model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight, fused_ffn", + f"{model_prefix}layers.{layer_id}.mlp.experts.{expert_id}.up_gate_proj.weight -> model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight, model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight, axis=0", ] - - if not using_sonic_moe: + else: aoa_statements += [ - f"model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight", - f"model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight", - f"model.layers.{layer_id}.mlp.experts.{expert_id}.down_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.down_proj.weight", + f"{model_prefix}layers.{layer_id}.mlp.experts.{expert_id}.up_gate_proj.weight -> model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight, model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight, axis=1", ] + else: + aoa_statements += [ + f"{model_prefix}layers.{layer_id}.mlp.experts.{expert_id}.up_gate_proj.weight -> model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight, model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight, fused_ffn", + ] + + if not using_sonic_moe: + aoa_statements += [ + f"model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight", + f"model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight", + f"model.layers.{layer_id}.mlp.experts.{expert_id}.down_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.down_proj.weight", + ] if config.tie_word_embeddings: aoa_statements += ["lm_head.weight -> _"] diff --git a/paddleformers/transformers/qwen3_next/modeling.py b/paddleformers/transformers/qwen3_next/modeling.py index 5780ac0416f..6014a6e0380 100644 --- a/paddleformers/transformers/qwen3_next/modeling.py +++ b/paddleformers/transformers/qwen3_next/modeling.py @@ -288,9 +288,12 @@ def extra_repr(self): class Qwen3NextAttention(Qwen3MoeAttention): def __init__(self, config: Qwen3NextConfig, layer_idx: int): super().__init__(config, layer_idx) - self.q_proj = GeneralLinear.create( + kv_hidden_size = 
self.config.num_key_value_heads * self.head_dim + q_hidden_size = self.config.num_attention_heads * self.head_dim * 2 + + self.qkv_proj = GeneralLinear.create( config.hidden_size, - config.num_attention_heads * self.head_dim * 2, + q_hidden_size + 2 * kv_hidden_size, has_bias=config.attention_bias, config=config, tp_plan="colwise", @@ -315,16 +318,26 @@ def forward( cache_position: Optional[Tensor] = None, **kwargs, ) -> tuple[Tensor, Optional[Tensor]]: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - + mix_layer = self.qkv_proj(hidden_states) if self.sequence_parallel: max_sequence_length = self.config.max_sequence_length bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length q_len = max_sequence_length + target_shape = [ + bsz, + q_len, + self.num_key_value_heads, + (self.num_key_value_groups * 2 + 2) * self.head_dim, + ] else: bsz, q_len, _ = hidden_states.shape + target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups * 2 + 2) * self.head_dim] + mix_layer = paddle.reshape_(mix_layer, target_shape) + query_states, key_states, value_states = paddle.split( + mix_layer, + num_or_sections=[self.num_key_value_groups * self.head_dim * 2, self.head_dim, self.head_dim], + axis=-1, + ) query_states, gate = paddle.chunk(query_states.view(bsz, q_len, -1, self.head_dim * 2), chunks=2, dim=-1) gate = gate.reshape(bsz, q_len, -1) @@ -933,13 +946,20 @@ def _gen_aoa_config(cls, config: Qwen3NextConfig): f"model.layers.$LAYER_ID.linear_attn.in_proj_qkvz.weight^T -> {model_prefix}layers.$LAYER_ID.linear_attn.in_proj_qkvz.weight", f"model.layers.$LAYER_ID.linear_attn.norm.weight -> {model_prefix}layers.$LAYER_ID.linear_attn.norm.weight", f"model.layers.$LAYER_ID.linear_attn.out_proj.weight^T -> {model_prefix}layers.$LAYER_ID.linear_attn.out_proj.weight", + f"model.layers.$LAYER_ID.self_attn.o_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.o_proj.weight", + f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.down_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.down_proj.weight", + f"model.layers.$LAYER_ID.mlp.shared_expert.down_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.shared_expert.down_proj.weight", ] - # self_attn + # attention qkv aoa_statements += [ - f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v", "o") + f"model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}", ] + if config.attention_bias: + aoa_statements += [ + f"model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias -> {model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0", + ] + aoa_statements += [ f"model.layers.$LAYER_ID.self_attn.{x}_norm.weight -> {model_prefix}layers.$LAYER_ID.self_attn.{x}_norm.weight" for x in ("q", "k") @@ -947,12 +967,8 @@ def _gen_aoa_config(cls, config: Qwen3NextConfig): # experts aoa_statements += [ - f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{x}_proj.weight^T -> 
{model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{x}_proj.weight" - for x in ("gate", "up", "down") - ] - aoa_statements += [ - f"model.layers.$LAYER_ID.mlp.shared_expert.{x}_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.shared_expert.{x}_proj.weight" - for x in ("gate", "up", "down") + f"model.layers.$LAYER_ID.mlp.shared_expert.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.shared_expert.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.shared_expert.up_gate_proj.weight, fused_ffn", + f"model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight^T, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight^T -> {model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight, fused_ffn", ] return {"aoa_statements": aoa_statements} diff --git a/paddleformers/transformers/qwen3_vl/modeling.py b/paddleformers/transformers/qwen3_vl/modeling.py index 52b76c3620b..473a1bc50fc 100644 --- a/paddleformers/transformers/qwen3_vl/modeling.py +++ b/paddleformers/transformers/qwen3_vl/modeling.py @@ -370,30 +370,18 @@ def _gen_aoa_config(cls, config: Qwen3VLConfig): ] # attention qkv - if not config.text_config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"model.language_model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - aoa_config["aoa_statements"] += [ - f"model.language_model.layers.$LAYER_ID.self_attn.{x}_proj.bias -> {llm_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: + aoa_config["aoa_statements"] += [ + f"model.language_model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.language_model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.language_model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups={config.text_config.num_key_value_heads}" + ] + if config.attention_bias: aoa_config["aoa_statements"] += [ - f"model.language_model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.language_model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.language_model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups={config.text_config.num_key_value_heads}" + f"model.language_model.layers.$LAYER_ID.self_attn.q_proj.bias, model.language_model.layers.$LAYER_ID.self_attn.k_proj.bias, model.language_model.layers.$LAYER_ID.self_attn.v_proj.bias -> {llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups={config.text_config.num_key_value_heads}" ] # FFN - if not config.text_config.fuse_attention_ffn: - aoa_config["aoa_statements"] += [ - f"model.language_model.layers.$LAYER_ID.mlp.{p}_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.mlp.{p}_proj.weight" - for p in ("gate", "up") - ] - else: - aoa_config["aoa_statements"] += [ - f"model.language_model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.language_model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", - ] + aoa_config["aoa_statements"] += [ + f"model.language_model.layers.$LAYER_ID.mlp.gate_proj.weight^T, model.language_model.layers.$LAYER_ID.mlp.up_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight, fused_ffn", + ] # Qwen3_VLModel without lm_head if 
cls._tied_weights_keys: @@ -471,40 +459,28 @@ def _gen_inv_aoa_config(cls, config: Qwen3VLConfig): ] # attention qkv - if not config.text_config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> model.language_model.layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias -> model.language_model.layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: - aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.language_model.layers.$LAYER_ID.self_attn.q_proj.weight, model.language_model.layers.$LAYER_ID.self_attn.k_proj.weight, model.language_model.layers.$LAYER_ID.self_attn.v_proj.weight, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups = {config.text_config.num_key_value_heads}", - ] + aoa_config["aoa_statements"] += [ + f"{llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> {llm_prefix}layers.$LAYER_ID.self_attn.q_proj.weight, {llm_prefix}layers.$LAYER_ID.self_attn.k_proj.weight, {llm_prefix}layers.$LAYER_ID.self_attn.v_proj.weight, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups = {config.text_config.num_key_value_heads}", + ] + aoa_config["aoa_statements"] += [ + f"{llm_prefix}layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.language_model.layers.{layer_id}.self_attn.{x}_proj.weight" + for layer_id in range(config.text_config.num_hidden_layers) + for x in ("q", "k", "v") + ] + if config.attention_bias: aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.language_model.layers.{layer_id}.self_attn.{x}_proj.weight" - for layer_id in range(config.text_config.num_hidden_layers) - for x in ("q", "k", "v") + f"{llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.language_model.layers.$LAYER_ID.self_attn.q_proj.bias, model.language_model.layers.$LAYER_ID.self_attn.k_proj.bias, model.language_model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups = {config.text_config.num_key_value_heads}", ] # FFN - if not config.text_config.fuse_attention_ffn: - aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.$LAYER_ID.mlp.{p}_proj.weight^T -> model.language_model.layers.$LAYER_ID.mlp.{p}_proj.weight" - for p in ("gate", "up") - ] - else: - aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight -> model.language_model.layers.$LAYER_ID.mlp.gate_proj.weight, model.language_model.layers.$LAYER_ID.mlp.up_proj.weight, fused_ffn" - ] - aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.{layer_id}.mlp.{x}_proj.weight^T -> model.language_model.layers.{layer_id}.mlp.{x}_proj.weight" - for layer_id in range(config.text_config.num_hidden_layers) - for x in ("gate", "up") - ] + aoa_config["aoa_statements"] += [ + f"{llm_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight -> {llm_prefix}layers.$LAYER_ID.mlp.gate_proj.weight, {llm_prefix}layers.$LAYER_ID.mlp.up_proj.weight, fused_ffn" + ] + aoa_config["aoa_statements"] += [ + f"{llm_prefix}layers.{layer_id}.mlp.{x}_proj.weight^T -> model.language_model.layers.{layer_id}.mlp.{x}_proj.weight" + for layer_id in range(config.text_config.num_hidden_layers) + for x in ("gate", "up") + ] # Qwen3VLModel without lm_head if cls._tied_weights_keys: @@ -915,7 +891,6 @@ def 
__init__(self, config: Qwen3VLTextConfig, layer_idx: Optional[int] = None): # ) self.sequence_parallel = config.sequence_parallel - self.fuse_attention_qkv = config.fuse_attention_qkv self.gqa_or_mqa = config.num_attention_heads != config.num_key_value_heads if config.tensor_model_parallel_size > 1: @@ -932,36 +907,13 @@ def __init__(self, config: Qwen3VLTextConfig, layer_idx: Optional[int] = None): kv_hidden_size = self.config.num_key_value_heads * self.head_dim q_hidden_size = self.config.num_attention_heads * self.head_dim - if not self.fuse_attention_qkv: - self.q_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - self.k_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - self.v_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - else: - self.qkv_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size + 2 * kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) + self.qkv_proj = GeneralLinear.create( + config.hidden_size, + q_hidden_size + 2 * kv_hidden_size, + has_bias=config.attention_bias, + config=config, + tp_plan="colwise", + ) self.o_proj = GeneralLinear.create( q_hidden_size, config.hidden_size, @@ -984,46 +936,29 @@ def forward( attn_mask_startend_row_indices: Optional[paddle.Tensor] = None, **kwargs, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: - if not self.fuse_attention_qkv: - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - else: - bsz, q_len, _ = hidden_states.shape - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.reshape(bsz, q_len, -1, self.head_dim) - key_states = key_states.reshape(bsz, q_len, -1, self.head_dim) - value_states = value_states.reshape(bsz, q_len, -1, self.head_dim) - + mix_layer = self.qkv_proj(hidden_states) + if self.sequence_parallel: + max_sequence_length = self.config.max_sequence_length + bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length + q_len = max_sequence_length + target_shape = [ + bsz, + q_len, + self.num_key_value_heads, + (self.num_key_value_groups + 2) * self.head_dim, + ] else: - mix_layer = self.qkv_proj(hidden_states) - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - target_shape = [ - bsz, - q_len, - self.num_key_value_heads, - (self.num_key_value_groups + 2) * self.head_dim, - ] - else: - target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] - # mix_layer = mix_layer.reshape(target_shape) - mix_layer = paddle.reshape_(mix_layer, target_shape) - query_states, key_states, value_states = paddle.split( - mix_layer, - num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], - axis=-1, - ) - if self.gqa_or_mqa: - # query_states = query_states.reshape([0, 0, self.num_heads, self.head_dim]) - query_states = 
paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) + target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] + # mix_layer = mix_layer.reshape(target_shape) + mix_layer = paddle.reshape_(mix_layer, target_shape) + query_states, key_states, value_states = paddle.split( + mix_layer, + num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], + axis=-1, + ) + if self.gqa_or_mqa: + # query_states = query_states.reshape([0, 0, self.num_heads, self.head_dim]) + query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) # apply qk_norm query_states = self.q_norm(query_states) @@ -1067,7 +1002,7 @@ def __init__(self, config: Qwen3VLTextConfig, layer_idx: int): self.hidden_size = config.hidden_size self.self_attn = Qwen3VLTextAttention(config, layer_idx) - self.mlp = Qwen3VLTextMLP(config, fuse_up_gate=config.fuse_attention_ffn) + self.mlp = Qwen3VLTextMLP(config, fuse_up_gate=True) self.input_layernorm = GeneralNorm.create( config=config, norm_type="rms_norm", diff --git a/paddleformers/transformers/qwen3_vl/modeling_fleet.py b/paddleformers/transformers/qwen3_vl/modeling_fleet.py index a156b10e924..369d7b4e4d2 100644 --- a/paddleformers/transformers/qwen3_vl/modeling_fleet.py +++ b/paddleformers/transformers/qwen3_vl/modeling_fleet.py @@ -336,7 +336,6 @@ class Qwen3VLTextProvider(GPTModelProvider): max_sequence_length: int = 262144 multimodal_embedding: bool = False _save_to_hf: bool = False - use_flash_attention: bool = True use_fused_linear_cross_entropy: bool = True high_precision_rope: bool = True moe_grouped_gemm: bool = True @@ -393,7 +392,6 @@ class Qwen3VLVisionProvider(TransformerConfig): class_token_len: int = 1 high_precision_rope: bool = True # _save_to_hf: bool = False - # use_flash_attention: bool = True # use_fused_linear_cross_entropy: bool = True # fuse_linear: bool = True # transform_rules: dict = field(default_factory=lambda: { diff --git a/paddleformers/transformers/qwen3_vl_moe/modeling.py b/paddleformers/transformers/qwen3_vl_moe/modeling.py index 32e57a562d0..0e300ba4504 100644 --- a/paddleformers/transformers/qwen3_vl_moe/modeling.py +++ b/paddleformers/transformers/qwen3_vl_moe/modeling.py @@ -354,6 +354,12 @@ def _gen_aoa_config(cls, config: Qwen3VLMoeConfig): f"model.language_model.layers.{layer_id}.self_attn.q_proj.weight^T, model.language_model.layers.{layer_id}.self_attn.k_proj.weight^T, model.language_model.layers.{layer_id}.self_attn.v_proj.weight^T -> {llm_prefix}{layer_id + 1}.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups={config.text_config.num_key_value_heads}" for layer_id in range(config.text_config.num_hidden_layers) ] + if config.attention_bias: + aoa_config["aoa_statements"] += [ + f"model.language_model.layers.{layer_id}.self_attn.q_proj.bias, model.language_model.layers.{layer_id}.self_attn.k_proj.bias, model.language_model.layers.{layer_id}.self_attn.v_proj.bias -> {llm_prefix}{layer_id + 1}.self_attn.qkv_proj.bias, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups={config.text_config.num_key_value_heads}" + for layer_id in range(config.text_config.num_hidden_layers) + ] + aoa_config["aoa_statements"] += [ lm_state for layer_id in range(config.text_config.num_hidden_layers) @@ -535,6 +541,12 @@ def _gen_inv_aoa_config(cls, config: Qwen3VLMoeConfig): f"{llm_prefix}{layer_id + 1}.self_attn.qkv_proj.weight -> 
model.language_model.layers.{layer_id}.self_attn.q_proj.weight, model.language_model.layers.{layer_id}.self_attn.k_proj.weight, model.language_model.layers.{layer_id}.self_attn.v_proj.weight, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups = {config.text_config.num_key_value_heads}" for layer_id in range(config.text_config.num_hidden_layers) ] + if config.attention_bias: + aoa_config["aoa_statements"] += [ + f"{llm_prefix}{layer_id + 1}.self_attn.qkv_proj.bias -> model.language_model.layers.{layer_id}.self_attn.q_proj.bias, model.language_model.layers.{layer_id}.self_attn.k_proj.bias, model.language_model.layers.{layer_id}.self_attn.v_proj.bias, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups = {config.text_config.num_key_value_heads}" + for layer_id in range(config.text_config.num_hidden_layers) + ] + aoa_config["aoa_statements"] += [ f"{llm_prefix}layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.language_model.layers.{layer_id}.self_attn.{x}_proj.weight" for layer_id in range(config.text_config.num_hidden_layers) @@ -653,19 +665,9 @@ def _gen_aoa_config(cls, config: Qwen3VLMoeConfig): ] # attention qkv - if not config.text_config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"model.language_model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - aoa_config["aoa_statements"] += [ - f"model.language_model.layers.$LAYER_ID.self_attn.{x}_proj.bias -> {llm_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: - aoa_config["aoa_statements"] += [ - f"model.language_model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.language_model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.language_model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups={config.text_config.num_key_value_heads}" - ] + aoa_config["aoa_statements"] += [ + f"model.language_model.layers.$LAYER_ID.self_attn.q_proj.weight^T, model.language_model.layers.$LAYER_ID.self_attn.k_proj.weight^T, model.language_model.layers.$LAYER_ID.self_attn.v_proj.weight^T -> {llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups={config.text_config.num_key_value_heads}" + ] # Qwen3_VLMoeModel without lm_head if cls._tied_weights_keys: @@ -745,24 +747,14 @@ def _gen_inv_aoa_config(cls, config: Qwen3VLMoeConfig): ] # attention qkv - if not config.text_config.fuse_attention_qkv: - aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> model.language_model.layers.$LAYER_ID.self_attn.{x}_proj.weight" - for x in ("q", "k", "v") - ] - aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.$LAYER_ID.self_attn.{x}_proj.bias -> model.language_model.layers.$LAYER_ID.self_attn.{x}_proj.bias" - for x in ("q", "k", "v") - ] - else: - aoa_config["aoa_statements"] += [ - f"{llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.language_model.layers.$LAYER_ID.self_attn.q_proj.weight, model.language_model.layers.$LAYER_ID.self_attn.k_proj.weight, model.language_model.layers.$LAYER_ID.self_attn.v_proj.weight, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups = {config.text_config.num_key_value_heads}", - ] - aoa_config["aoa_statements"] += [ - 
f"{llm_prefix}layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.language_model.layers.{layer_id}.self_attn.{x}_proj.weight" - for layer_id in range(config.text_config.num_hidden_layers) - for x in ("q", "k", "v") - ] + aoa_config["aoa_statements"] += [ + f"{llm_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> {llm_prefix}layers.$LAYER_ID.self_attn.q_proj.weight, {llm_prefix}layers.$LAYER_ID.self_attn.k_proj.weight, {llm_prefix}layers.$LAYER_ID.self_attn.v_proj.weight, fused_qkv, num_heads={config.text_config.num_attention_heads}, num_key_value_groups = {config.text_config.num_key_value_heads}", + ] + aoa_config["aoa_statements"] += [ + f"{llm_prefix}layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.language_model.layers.{layer_id}.self_attn.{x}_proj.weight" + for layer_id in range(config.text_config.num_hidden_layers) + for x in ("q", "k", "v") + ] # Qwen3VLMoeModel without lm_head if cls._tied_weights_keys: @@ -1193,7 +1185,6 @@ def __init__(self, config: Qwen3VLMoeTextConfig, layer_idx: Optional[int] = None ) self.sequence_parallel = config.sequence_parallel - self.fuse_attention_qkv = config.fuse_attention_qkv self.gqa_or_mqa = config.num_attention_heads != config.num_key_value_heads if config.tensor_model_parallel_size > 1: @@ -1210,36 +1201,13 @@ def __init__(self, config: Qwen3VLMoeTextConfig, layer_idx: Optional[int] = None kv_hidden_size = self.config.num_key_value_heads * self.head_dim q_hidden_size = self.config.num_attention_heads * self.head_dim - if not self.fuse_attention_qkv: - self.q_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - self.k_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - self.v_proj = GeneralLinear.create( - config.hidden_size, - kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) - else: - self.qkv_proj = GeneralLinear.create( - config.hidden_size, - q_hidden_size + 2 * kv_hidden_size, - has_bias=config.attention_bias, - config=config, - tp_plan="colwise", - ) + self.qkv_proj = GeneralLinear.create( + config.hidden_size, + q_hidden_size + 2 * kv_hidden_size, + has_bias=config.attention_bias, + config=config, + tp_plan="colwise", + ) self.o_proj = GeneralLinear.create( q_hidden_size, config.hidden_size, @@ -1262,46 +1230,29 @@ def forward( attn_mask_startend_row_indices: Optional[paddle.Tensor] = None, **kwargs, ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: - if not self.fuse_attention_qkv: - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - else: - bsz, q_len, _ = hidden_states.shape - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.reshape(bsz, q_len, -1, self.head_dim) - key_states = key_states.reshape(bsz, q_len, -1, self.head_dim) - value_states = value_states.reshape(bsz, q_len, -1, self.head_dim) - + mix_layer = self.qkv_proj(hidden_states) + if self.sequence_parallel: + max_sequence_length = self.config.max_sequence_length + bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length + q_len = max_sequence_length + target_shape = [ + bsz, + q_len, + 
self.num_key_value_heads, + (self.num_key_value_groups + 2) * self.head_dim, + ] else: - mix_layer = self.qkv_proj(hidden_states) - if self.sequence_parallel: - max_sequence_length = self.config.max_sequence_length - bsz = hidden_states.shape[0] * self.config.tensor_model_parallel_size // max_sequence_length - q_len = max_sequence_length - target_shape = [ - bsz, - q_len, - self.num_key_value_heads, - (self.num_key_value_groups + 2) * self.head_dim, - ] - else: - target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] - # mix_layer = mix_layer.reshape(target_shape) - mix_layer = paddle.reshape_(mix_layer, target_shape) - query_states, key_states, value_states = paddle.split( - mix_layer, - num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], - axis=-1, - ) - if self.gqa_or_mqa: - # query_states = query_states.reshape([0, 0, self.num_heads, self.head_dim]) - query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) + target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] + # mix_layer = mix_layer.reshape(target_shape) + mix_layer = paddle.reshape_(mix_layer, target_shape) + query_states, key_states, value_states = paddle.split( + mix_layer, + num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], + axis=-1, + ) + if self.gqa_or_mqa: + # query_states = query_states.reshape([0, 0, self.num_heads, self.head_dim]) + query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) # apply qk_norm query_states = self.q_norm(query_states) @@ -1356,7 +1307,7 @@ def __init__(self, config: Qwen3VLMoeTextConfig, layer_idx: int): ): self.mlp = Qwen3VLMoeTextSparseMoeBlock(config) else: - self.mlp = Qwen3VLMoeTextMLP(config, fuse_up_gate=config.fuse_attention_ffn) + self.mlp = Qwen3VLMoeTextMLP(config, fuse_up_gate=True) self.input_layernorm = GeneralNorm.create( config=config, norm_type="rms_norm", diff --git a/scripts/regression/test_dpo_tiny-random-glm4moe.py b/scripts/regression/test_dpo_tiny-random-glm4moe.py index 344eca6be17..8b6029502fa 100644 --- a/scripts/regression/test_dpo_tiny-random-glm4moe.py +++ b/scripts/regression/test_dpo_tiny-random-glm4moe.py @@ -133,8 +133,6 @@ def test_dpo_full(self): "max_steps": MAX_STEPS, "save_steps": SAVE_STEPS, "sharding": "stage1", - "fuse_attention_qkv": "true", - "fuse_attention_ffn": "true", "template": TEMPLATE, } config_path = os.path.join(CONFIG_PATH, "full.yaml") @@ -242,8 +240,6 @@ def test_dpo_full_tp_pp(self): "output_dir": output_dir, "max_steps": MAX_STEPS, "save_steps": SAVE_STEPS, - "fuse_attention_qkv": "true", - "fuse_attention_ffn": "true", "template": TEMPLATE, } config_path = os.path.join(CONFIG_PATH, "full_tp_pp.yaml") @@ -295,8 +291,6 @@ def test_dpo_lora_tp_pp(self): "output_dir": output_dir, "max_steps": MAX_STEPS, "save_steps": SAVE_STEPS, - "fuse_attention_qkv": "true", - "fuse_attention_ffn": "true", "template": TEMPLATE, } config_path = os.path.join(CONFIG_PATH, "lora_tp_pp.yaml") diff --git a/scripts/regression/test_pt_tiny-random-glm4moe.py b/scripts/regression/test_pt_tiny-random-glm4moe.py index dd707b9ea3b..b6a05289546 100644 --- a/scripts/regression/test_pt_tiny-random-glm4moe.py +++ b/scripts/regression/test_pt_tiny-random-glm4moe.py @@ -131,8 +131,6 @@ def test_pt_full(self): "max_steps": MAX_STEPS, "save_steps": SAVE_STEPS, "sharding": "stage1", - "fuse_attention_qkv": "true", - "fuse_attention_ffn": "true", } config_path = 
os.path.join(CONFIG_PATH, "full.yaml") updated_config_path = self.pttrain_tester.update_training_args(config_path, output_dir, update_args) @@ -183,8 +181,6 @@ def test_pt_lora(self): "max_steps": MAX_STEPS, "save_steps": SAVE_STEPS, "sharding": "stage1", - "fuse_attention_qkv": "true", - "fuse_attention_ffn": "true", } config_path = os.path.join(CONFIG_PATH, "lora.yaml") updated_config_path = self.pttrain_tester.update_training_args(config_path, output_dir, update_args) @@ -242,8 +238,6 @@ def test_pt_full_tp_pp(self): "output_dir": output_dir, "max_steps": MAX_STEPS, "save_steps": SAVE_STEPS, - "fuse_attention_qkv": "true", - "fuse_attention_ffn": "true", } config_path = os.path.join(CONFIG_PATH, "full_tp_pp.yaml") updated_config_path = self.pttrain_tester.update_training_args(config_path, output_dir, update_args) @@ -293,8 +287,6 @@ def test_pt_lora_tp_pp(self): "output_dir": output_dir, "max_steps": MAX_STEPS, "save_steps": SAVE_STEPS, - "fuse_attention_qkv": "true", - "fuse_attention_ffn": "true", } config_path = os.path.join(CONFIG_PATH, "lora_tp_pp.yaml") updated_config_path = self.pttrain_tester.update_training_args(config_path, output_dir, update_args) diff --git a/scripts/regression/test_sft_tiny-random-glm4moe.py b/scripts/regression/test_sft_tiny-random-glm4moe.py index face70c85fa..c42d7531cb5 100644 --- a/scripts/regression/test_sft_tiny-random-glm4moe.py +++ b/scripts/regression/test_sft_tiny-random-glm4moe.py @@ -132,8 +132,6 @@ def test_sft_full(self): "max_steps": MAX_STEPS, "save_steps": SAVE_STEPS, "sharding": "stage1", - "fuse_attention_qkv": "true", - "fuse_attention_ffn": "true", "template": TEMPLATE, } config_path = os.path.join(CONFIG_PATH, "full.yaml") @@ -184,8 +182,6 @@ def test_sft_lora(self): "max_steps": MAX_STEPS, "save_steps": SAVE_STEPS, "sharding": "stage1", - "fuse_attention_qkv": "true", - "fuse_attention_ffn": "true", "template": TEMPLATE, } config_path = os.path.join(CONFIG_PATH, "lora.yaml") @@ -244,8 +240,6 @@ def test_sft_full_tp_pp(self): "output_dir": output_dir, "max_steps": MAX_STEPS, "save_steps": SAVE_STEPS, - "fuse_attention_qkv": "true", - "fuse_attention_ffn": "true", "template": TEMPLATE, } config_path = os.path.join(CONFIG_PATH, "full_tp_pp.yaml") @@ -296,8 +290,6 @@ def test_sft_lora_tp_pp(self): "output_dir": output_dir, "max_steps": MAX_STEPS, "save_steps": SAVE_STEPS, - "fuse_attention_qkv": "true", - "fuse_attention_ffn": "true", "template": TEMPLATE, } config_path = os.path.join(CONFIG_PATH, "lora_tp_pp.yaml") @@ -358,8 +350,6 @@ def test_sft_full_function_call(self): "max_steps": MAX_STEPS, "save_steps": SAVE_STEPS, "sharding": "stage1", - "fuse_attention_qkv": "true", - "fuse_attention_ffn": "true", "template": TEMPLATE, } config_path = os.path.join(CONFIG_PATH, "full_function_call.yaml") diff --git a/tests/config/benchmark/config/pt/GLM4.5-Air.yaml b/tests/config/benchmark/config/pt/GLM4.5-Air.yaml index eeeffc49e88..3a171c8ff6b 100644 --- a/tests/config/benchmark/config/pt/GLM4.5-Air.yaml +++ b/tests/config/benchmark/config/pt/GLM4.5-Air.yaml @@ -15,7 +15,7 @@ prefetch_factor: 24 ### model model_name_or_path: /root/paddlejob/gpfs/efficient_benchmark/huggingface/GLM-4.5-Air -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -66,7 +66,6 @@ using_sonic_moe: true moe_grouped_gemm: true apply_rope_fusion: true -fuse_rms_norm: true fp32_residual_connection: false # moe_router_force_load_balancing: true @@ -94,9 +93,6 @@ save_checkpoint_format: flex_checkpoint load_checkpoint_format: flex_checkpoint 
continue_training: true -fuse_attention_qkv: true -fuse_attention_ffn: true - tensorwise_offload_optimizer: true benchmark: true diff --git a/tests/config/benchmark/config/pt/GLM4.5-Air_64k.yaml b/tests/config/benchmark/config/pt/GLM4.5-Air_64k.yaml index a09207380df..f0b99da5bd0 100644 --- a/tests/config/benchmark/config/pt/GLM4.5-Air_64k.yaml +++ b/tests/config/benchmark/config/pt/GLM4.5-Air_64k.yaml @@ -15,7 +15,7 @@ padding_free: true ### model model_name_or_path: /root/paddlejob/gpfs/efficient_benchmark/huggingface/GLM-4.5-Air -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -63,7 +63,6 @@ moe_grouped_gemm: true moe_deep_gemm: false apply_rope_fusion: true -fuse_rms_norm: true moe_router_force_load_balancing: false sequence_parallel: true @@ -98,7 +97,5 @@ benchmark: true dataloader_num_workers: 24 prefetch_factor: 24 -fuse_attention_qkv: true -fuse_attention_ffn: true fp32_residual_connection: false tensorwise_offload_optimizer: true diff --git a/tests/config/benchmark/config/pt/Qwen3-30B-A3B-Base-64k.yaml b/tests/config/benchmark/config/pt/Qwen3-30B-A3B-Base-64k.yaml index 2ab9e71e9dc..2721cd5142a 100644 --- a/tests/config/benchmark/config/pt/Qwen3-30B-A3B-Base-64k.yaml +++ b/tests/config/benchmark/config/pt/Qwen3-30B-A3B-Base-64k.yaml @@ -22,7 +22,7 @@ prefetch_factor: 24 ### model model_name_or_path: /root/paddlejob/gpfs/huangjiyi/Models/Qwen3-30B-A3B -attn_impl: flashmask +_attn_implementation: flashmask use_qk_norm: true ### finetuning @@ -75,7 +75,6 @@ split_param: true stage1_overlap: true apply_rope_fusion: true -fuse_rms_norm: true moe_deep_gemm: true moe_grouped_gemm: true moe_router_fusion: true @@ -98,9 +97,6 @@ amp_master_grad: true bf16: true fp16_opt_level: O2 -fuse_attention_qkv: true -fuse_attention_ffn: true - save_checkpoint_format: "flex_checkpoint" load_checkpoint_format: "flex_checkpoint" diff --git a/tests/config/benchmark/config/pt/Qwen3-30B-A3B-Base.yaml b/tests/config/benchmark/config/pt/Qwen3-30B-A3B-Base.yaml index 82c4ed72737..377efbc6045 100644 --- a/tests/config/benchmark/config/pt/Qwen3-30B-A3B-Base.yaml +++ b/tests/config/benchmark/config/pt/Qwen3-30B-A3B-Base.yaml @@ -24,7 +24,7 @@ prefetch_factor: 24 ### model model_name_or_path: Qwen/Qwen3-30B-A3B-Base -attn_impl: flashmask +_attn_implementation: flashmask use_qk_norm: true ### finetuning @@ -75,7 +75,6 @@ stage1_overlap: true sd_release_grads: true apply_rope_fusion: true -fuse_rms_norm: true moe_grouped_gemm: true moe_ep_barrier: false moe_router_fusion: true @@ -94,9 +93,6 @@ bf16: true fp16_opt_level: O2 amp_master_grad: true -fuse_attention_qkv: true -fuse_attention_ffn: true - save_checkpoint_format: "flex_checkpoint" load_checkpoint_format: "flex_checkpoint" diff --git a/tests/config/benchmark/config/sft/GLM4.5-Air.yaml b/tests/config/benchmark/config/sft/GLM4.5-Air.yaml index dcb2c917180..d7c8fa7b2a8 100644 --- a/tests/config/benchmark/config/sft/GLM4.5-Air.yaml +++ b/tests/config/benchmark/config/sft/GLM4.5-Air.yaml @@ -15,7 +15,7 @@ prefetch_factor: 24 ### model model_name_or_path: /root/paddlejob/gpfs/efficient_benchmark/huggingface/GLM-4.5-Air -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -66,7 +66,6 @@ using_sonic_moe: true moe_grouped_gemm: true apply_rope_fusion: true -fuse_rms_norm: true fp32_residual_connection: false # moe_router_force_load_balancing: true @@ -94,9 +93,6 @@ save_checkpoint_format: flex_checkpoint load_checkpoint_format: flex_checkpoint continue_training: true -fuse_attention_qkv: true 
-fuse_attention_ffn: true - tensorwise_offload_optimizer: true benchmark: true diff --git a/tests/config/benchmark/config/sft/GLM4.5-Air_64k.yaml b/tests/config/benchmark/config/sft/GLM4.5-Air_64k.yaml index f37a5e55422..57e65d3a9ef 100644 --- a/tests/config/benchmark/config/sft/GLM4.5-Air_64k.yaml +++ b/tests/config/benchmark/config/sft/GLM4.5-Air_64k.yaml @@ -16,7 +16,7 @@ padding_free: true ### model model_name_or_path: /root/paddlejob/gpfs/efficient_benchmark/huggingface/GLM-4.5-Air -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -64,7 +64,6 @@ moe_grouped_gemm: true moe_deep_gemm: false apply_rope_fusion: true -fuse_rms_norm: true moe_router_force_load_balancing: false sequence_parallel: true @@ -99,7 +98,5 @@ benchmark: true dataloader_num_workers: 24 prefetch_factor: 24 -fuse_attention_qkv: true -fuse_attention_ffn: true fp32_residual_connection: false tensorwise_offload_optimizer: true diff --git a/tests/config/benchmark/config/sft/Qwen3-30B-A3B-Base-64k.yaml b/tests/config/benchmark/config/sft/Qwen3-30B-A3B-Base-64k.yaml index 42c680a14c4..5e41e5f3721 100644 --- a/tests/config/benchmark/config/sft/Qwen3-30B-A3B-Base-64k.yaml +++ b/tests/config/benchmark/config/sft/Qwen3-30B-A3B-Base-64k.yaml @@ -22,7 +22,7 @@ prefetch_factor: 24 ### model model_name_or_path: /root/paddlejob/gpfs/huangjiyi/Models/Qwen3-30B-A3B -attn_impl: flashmask +_attn_implementation: flashmask use_qk_norm: true ### finetuning @@ -75,7 +75,6 @@ split_param: true stage1_overlap: true apply_rope_fusion: true -fuse_rms_norm: true moe_deep_gemm: true moe_grouped_gemm: true moe_router_fusion: true @@ -98,9 +97,6 @@ amp_master_grad: true bf16: true fp16_opt_level: O2 -fuse_attention_qkv: true -fuse_attention_ffn: true - save_checkpoint_format: "flex_checkpoint" load_checkpoint_format: "flex_checkpoint" diff --git a/tests/config/benchmark/config/sft/Qwen3-30B-A3B-Base.yaml b/tests/config/benchmark/config/sft/Qwen3-30B-A3B-Base.yaml index 4f65349d711..c6b571dfda8 100644 --- a/tests/config/benchmark/config/sft/Qwen3-30B-A3B-Base.yaml +++ b/tests/config/benchmark/config/sft/Qwen3-30B-A3B-Base.yaml @@ -24,7 +24,7 @@ prefetch_factor: 24 ### model model_name_or_path: Qwen/Qwen3-30B-A3B-Base -attn_impl: flashmask +_attn_implementation: flashmask use_qk_norm: true ### finetuning @@ -75,7 +75,6 @@ stage1_overlap: true sd_release_grads: true apply_rope_fusion: true -fuse_rms_norm: true moe_grouped_gemm: true moe_ep_barrier: false moe_router_fusion: true @@ -95,9 +94,6 @@ bf16: true fp16_opt_level: O2 amp_master_grad: true -fuse_attention_qkv: true -fuse_attention_ffn: true - save_checkpoint_format: "flex_checkpoint" load_checkpoint_format: "flex_checkpoint" diff --git a/tests/config/ci/glm45_dpo.yaml b/tests/config/ci/glm45_dpo.yaml index 0de6b4ba544..113424153e9 100644 --- a/tests/config/ci/glm45_dpo.yaml +++ b/tests/config/ci/glm45_dpo.yaml @@ -11,8 +11,8 @@ mix_strategy: concat ### model model_name_or_path: zai-org/GLM-4.5-Air-Base/ -#attn_impl: sdpa -attn_impl: flashmask +#_attn_implementation: sdpa +_attn_implementation: flashmask ### finetuning # base @@ -66,7 +66,5 @@ moe_router_force_load_balancing: true clear_every_step_cache: true partial_send_recv: false #use_cpu_initialization: true -fuse_attention_qkv: true -fuse_attention_ffn: true num_empty_layers_add_in_tail: 1 \ No newline at end of file diff --git a/tests/config/ci/glm45_lora.yaml b/tests/config/ci/glm45_lora.yaml index 5fc40e364f1..122e017e796 100644 --- a/tests/config/ci/glm45_lora.yaml +++ b/tests/config/ci/glm45_lora.yaml @@ 
-11,7 +11,7 @@ mix_strategy: concat ### model model_name_or_path: ../zai-org/GLM-4.5-Air -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 @@ -42,8 +42,6 @@ warmup_steps: 20 learning_rate: 1.0e-4 # performance -fuse_attention_qkv: true -fuse_attention_ffn: true moe_token_dispatcher_type: "deepep" gated_linear_unit: true tensor_model_parallel_size: 4 diff --git a/tests/config/ci/glm45_pt.yaml b/tests/config/ci/glm45_pt.yaml index 4c0b3f53665..de0e34c92a3 100644 --- a/tests/config/ci/glm45_pt.yaml +++ b/tests/config/ci/glm45_pt.yaml @@ -11,7 +11,7 @@ mix_strategy: concat ### model model_name_or_path: ./GLM-4.5-Air -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -65,6 +65,4 @@ gated_linear_unit: true num_hidden_layers: 3 apply_rope_fusion: true moe_router_fusion: true -router_aux_loss_coef: 0.001 -fuse_attention_qkv: true -fuse_attention_ffn: true \ No newline at end of file +router_aux_loss_coef: 0.001 \ No newline at end of file diff --git a/tests/config/ci/glm45_pt_fp8.yaml b/tests/config/ci/glm45_pt_fp8.yaml index 4779482fa61..11e2cb76066 100644 --- a/tests/config/ci/glm45_pt_fp8.yaml +++ b/tests/config/ci/glm45_pt_fp8.yaml @@ -11,7 +11,7 @@ mix_strategy: concat ### model model_name_or_path: ./GLM-4.5-Air -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -66,6 +66,4 @@ num_hidden_layers: 3 apply_rope_fusion: true moe_router_fusion: true router_aux_loss_coef: 0.001 -fp8: "e4m3" -fuse_attention_qkv: true -fuse_attention_ffn: true \ No newline at end of file +fp8: "e4m3" \ No newline at end of file diff --git a/tests/config/ci/glm45_pt_grouped_gemm.yaml b/tests/config/ci/glm45_pt_grouped_gemm.yaml index c4cf1478b61..b0977f7314e 100644 --- a/tests/config/ci/glm45_pt_grouped_gemm.yaml +++ b/tests/config/ci/glm45_pt_grouped_gemm.yaml @@ -11,7 +11,7 @@ mix_strategy: concat ### model model_name_or_path: ./GLM-4.5-Air -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -66,8 +66,6 @@ num_hidden_layers: 3 apply_rope_fusion: true moe_router_fusion: true router_aux_loss_coef: 0.001 -fuse_attention_qkv: true -fuse_attention_ffn: true # grouped gemm moe_grouped_gemm: true diff --git a/tests/config/ci/glm45_sft.yaml b/tests/config/ci/glm45_sft.yaml index 066f15c2e94..7c8cd4939d6 100644 --- a/tests/config/ci/glm45_sft.yaml +++ b/tests/config/ci/glm45_sft.yaml @@ -11,7 +11,7 @@ mix_strategy: concat ### model model_name_or_path: ./zai-org/GLM-4.5-Air -attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 @@ -42,8 +42,6 @@ warmup_steps: 20 learning_rate: 1.0e-4 # performance -fuse_attention_qkv: true -fuse_attention_ffn: true moe_token_dispatcher_type: "deepep" gated_linear_unit: true tensor_model_parallel_size: 4 diff --git a/tests/config/ci/glm45_single_pt-test.yaml b/tests/config/ci/glm45_single_pt-test.yaml index 0bb12fcac3b..a44009636ea 100644 --- a/tests/config/ci/glm45_single_pt-test.yaml +++ b/tests/config/ci/glm45_single_pt-test.yaml @@ -9,7 +9,7 @@ split: "998,1,1" ### modelv model_name_or_path: /home/.cache/glm45/GLM-4.5-Air -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base diff --git a/tests/config/ci/qwen3_multicard_lora.yaml b/tests/config/ci/qwen3_multicard_lora.yaml index ebada0e2b99..474ec6350f8 100644 --- a/tests/config/ci/qwen3_multicard_lora.yaml +++ b/tests/config/ci/qwen3_multicard_lora.yaml @@ -17,7 +17,7 @@ mix_strategy: concat ### model model_name_or_path: ./checkpoints/qwen3-30b-a3b-sft gated_linear_unit: true 
-attn_impl: flashmask +_attn_implementation: flashmask lora: true lora_rank: 8 @@ -67,10 +67,7 @@ optim: adamw bf16: true fp16_opt_level: O2 amp_master_grad: true -fuse_attention_qkv: true -fuse_attention_ffn: true fuse_swiglu: true -fuse_rms_norm: true fuse_linear: true use_paddlefleet: true use_qk_norm: true diff --git a/tests/config/ci/qwen3_multicard_pt.yaml b/tests/config/ci/qwen3_multicard_pt.yaml index f8b4509186b..aa91aff36bb 100644 --- a/tests/config/ci/qwen3_multicard_pt.yaml +++ b/tests/config/ci/qwen3_multicard_pt.yaml @@ -16,7 +16,7 @@ mix_strategy: concat ### model model_name_or_path: ./Qwen3-30B-A3B gated_linear_unit: true -attn_impl: flashmask +_attn_implementation: flashmask num_hidden_layers: 4 ### finetuning @@ -65,10 +65,7 @@ optim: adamw bf16: true fp16_opt_level: O2 amp_master_grad: true -fuse_attention_qkv: true -fuse_attention_ffn: true fuse_swiglu: true -fuse_rms_norm: true fuse_linear: true use_paddlefleet: true use_qk_norm: true diff --git a/tests/config/ci/qwen3_multicard_sft.yaml b/tests/config/ci/qwen3_multicard_sft.yaml index 388c2b7cfab..ec5a8df0493 100644 --- a/tests/config/ci/qwen3_multicard_sft.yaml +++ b/tests/config/ci/qwen3_multicard_sft.yaml @@ -17,7 +17,7 @@ mix_strategy: concat ### model model_name_or_path: ./checkpoints/qwen3-30b-a3b-pt gated_linear_unit: true -attn_impl: flashmask +_attn_implementation: flashmask num_hidden_layers: 4 ### finetuning @@ -66,10 +66,7 @@ optim: adamw bf16: true fp16_opt_level: O2 amp_master_grad: true -fuse_attention_qkv: true -fuse_attention_ffn: true fuse_swiglu: true -fuse_rms_norm: true fuse_linear: true use_paddlefleet: true use_qk_norm: true diff --git a/tests/config/ci/qwen3_pt.yaml b/tests/config/ci/qwen3_pt.yaml index ed92131055b..f50c8652947 100644 --- a/tests/config/ci/qwen3_pt.yaml +++ b/tests/config/ci/qwen3_pt.yaml @@ -12,7 +12,7 @@ mix_strategy: concat ### model model_name_or_path: ./Qwen3-30B-A3B-Base -attn_impl: flashmask +_attn_implementation: flashmask ### finetuning # base @@ -53,8 +53,6 @@ optim: adamw bf16: true fp16_opt_level: O2 amp_master_grad: true -fuse_attention_qkv: true -fuse_attention_ffn: true fuse_swiglu: true use_qk_norm: true diff --git a/tests/mergekit/test_merge_model.py b/tests/mergekit/test_merge_model.py index e71cb2aa171..3db7acbd8e3 100644 --- a/tests/mergekit/test_merge_model.py +++ b/tests/mergekit/test_merge_model.py @@ -145,8 +145,6 @@ def test_fuse_qkv_lora_merge_torch(self): from paddleformers.transformers import Qwen3Config, Qwen3ForCausalLM model_config = Qwen3Config.from_pretrained(torch_model_path) - model_config.fuse_attention_qkv = True - model_config.fuse_attention_ffn = True fused_base_model = Qwen3ForCausalLM.from_pretrained( torch_model_path, config=model_config, diff --git a/tests/peft/test_lora.py b/tests/peft/test_lora.py index a7679e6b2b0..c0b1cc78922 100644 --- a/tests/peft/test_lora.py +++ b/tests/peft/test_lora.py @@ -87,7 +87,7 @@ def test_load_regular_linear(self): class TestLoraModel(unittest.TestCase): def test_lora_model_restore(self): lora_config = LoRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], + target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, enable_lora_list=[None, [True, False]], @@ -109,7 +109,7 @@ def test_lora_model_restore(self): @parameterized.expand([(None,), ("all",), ("lora",)]) def test_lora_model_constructor(self, bias): lora_config = LoRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], + target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, enable_lora_list=[None, [True, False]], @@ -149,7 +149,7 @@ def 
test_lora_model_save_load(self): with TemporaryDirectory() as tempdir: input_ids = paddle.to_tensor(np.random.randint(100, 200, [1, 20])) lora_config = LoRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], + target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, ) @@ -182,7 +182,7 @@ def test_lora_module_raise_exception(self): LoRAModel(model, lora_config) def test_lora_get_merge_state_dict(self): - lora_config = LoRAConfig(target_modules=[".*q_proj.*", ".*v_proj.*"], r=4, lora_alpha=8) + lora_config = LoRAConfig(target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8) model = AutoModelForCausalLM.from_pretrained("PaddleFormers/tiny-random-qwen3", convert_from_hf=True) model.eval() lora_model = LoRAModel(model, lora_config) @@ -201,7 +201,7 @@ def test_lora_get_merge_state_dict(self): self.assertIsInstance(merged_weight, paddle.Tensor) - if any(target in k for target in ["q_proj", "v_proj"]): + if any(target in k for target in ["qkv_proj"]): lora_A_key = k.replace("weight", "lora_A") lora_B_key = k.replace("weight", "lora_B") @@ -230,12 +230,15 @@ def test_lora_model_save_load_fc(self): with TemporaryDirectory() as tempdir: input_ids = paddle.to_tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) lora_config = LoRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], + target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, ) model = Glm4MoeModel.from_pretrained( - "PaddleFormers/tiny-random-glm4moe", download_hub="aistudio", convert_from_hf=True + "PaddleFormers/tiny-random-glm4moe-bf16", + download_hub="aistudio", + convert_from_hf=True, + dtype="float32", ) lora_model = LoRAModel(model, lora_config) lora_model.eval() diff --git a/tests/peft/test_lorapro.py b/tests/peft/test_lorapro.py index 9dc5e2103a8..13fc110be49 100644 --- a/tests/peft/test_lorapro.py +++ b/tests/peft/test_lorapro.py @@ -93,7 +93,7 @@ def tearDown(self): def test_lorapro_model_restore(self): lorapro_config = LoRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], + target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, enable_lora_list=[None, [True, False]], @@ -116,7 +116,7 @@ def test_lorapro_model_restore(self): @parameterized.expand([(None,), ("all",), ("lora",)]) def test_lorapro_model_constructor(self, bias): lorapro_config = LoRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], + target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, enable_lora_list=[None, [True, False]], @@ -156,7 +156,7 @@ def test_lorapro_model_constructor(self, bias): def test_lorapro_model_save_load(self): with TemporaryDirectory() as tempdir: input_ids = paddle.to_tensor(np.random.randint(100, 200, [1, 20])) - lorapro_config = LoRAConfig(target_modules=[".*q_proj.*", ".*v_proj.*"], r=4, lora_alpha=8, lorapro=True) + lorapro_config = LoRAConfig(target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, lorapro=True) model = AutoModelForCausalLM.from_pretrained("PaddleFormers/tiny-random-qwen3", convert_from_hf=True) lorapro_model = LoRAModel(model, lorapro_config) lorapro_model.eval() @@ -177,7 +177,7 @@ def test_lorapro_model_save_load(self): def test_lorapro_modes(self, x_mode): """Test if AdamWLoRAPro optimizer with different x_modes can perform optimization steps""" lorapro_config = LoRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], + target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, enable_lora_list=[None, [True, False]], diff --git a/tests/peft/test_mora.py b/tests/peft/test_mora.py index 9324d2f4913..0b53d038db4 100644 --- a/tests/peft/test_mora.py +++ b/tests/peft/test_mora.py @@ -94,7 +94,7 @@ def test_unmerge(self): 
class TestMoraModel(unittest.TestCase): def test_mora_model_restore(self): mora_config = LoRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], + target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, enable_lora_list=[None, [True, False]], @@ -117,7 +117,7 @@ def test_mora_model_restore(self): @parameterized.expand([(None,), ("all",), ("lora",)]) def test_mora_model_constructor(self, bias): mora_config = LoRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], + target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, enable_lora_list=[None, [True, False]], @@ -157,7 +157,7 @@ def test_mora_model_constructor(self, bias): def test_mora_model_save_load(self): with TemporaryDirectory() as tempdir: input_ids = paddle.to_tensor(np.random.randint(100, 200, [1, 20])) - mora_config = LoRAConfig(target_modules=[".*q_proj.*", ".*v_proj.*"], r=4, lora_alpha=8, use_mora=True) + mora_config = LoRAConfig(target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, use_mora=True) model = AutoModelForCausalLM.from_pretrained("PaddleFormers/tiny-random-qwen3", convert_from_hf=True) mora_model = LoRAModel(model, mora_config) mora_model.eval() diff --git a/tests/peft/test_mos_lora.py b/tests/peft/test_mos_lora.py index 929926959ec..e50f80f3cc2 100644 --- a/tests/peft/test_mos_lora.py +++ b/tests/peft/test_mos_lora.py @@ -97,7 +97,7 @@ def test_unmerge(self): class TestMosLoraModel(unittest.TestCase): def test_lora_model_restore(self): lora_config = LoRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], + target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, enable_lora_list=[None, [True, False]], @@ -119,7 +119,7 @@ def test_lora_model_restore(self): def test_parallel_support(self): lora_config = LoRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], + target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, enable_lora_list=[None, [True, False]], @@ -135,7 +135,7 @@ def test_parallel_support(self): @parameterized.expand([(None,), ("all",), ("lora",)]) def test_lora_model_constructor(self, bias): lora_config = LoRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], + target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, enable_lora_list=[None, [True, False]], @@ -175,9 +175,7 @@ def test_lora_model_constructor(self, bias): def test_lora_model_save_load(self): with TemporaryDirectory() as tempdir: input_ids = paddle.to_tensor(np.random.randint(100, 200, [1, 20])) - lora_config = LoRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], r=4, lora_alpha=8, lora_use_mixer=True - ) + lora_config = LoRAConfig(target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, lora_use_mixer=True) model = AutoModelForCausalLM.from_pretrained("PaddleFormers/tiny-random-qwen3", convert_from_hf=True) lora_model = LoRAModel(model, lora_config) lora_model.eval() diff --git a/tests/peft/test_quant_lora.py b/tests/peft/test_quant_lora.py index db2ef08a3e2..bfbae40d538 100644 --- a/tests/peft/test_quant_lora.py +++ b/tests/peft/test_quant_lora.py @@ -98,7 +98,7 @@ class TestQuantedLoRAModel(unittest.TestCase): @classmethod def setUpClass(cls): lora_config = LoRAConfig( - target_modules=[".*q_proj.*", ".*v_proj.*"], + target_modules=[".*qkv_proj.*"], r=4, lora_alpha=8, ) @@ -128,8 +128,8 @@ def test_count_model_layers(self): self.lora_model.train() quant_lora_model = qat.quantize(self.lora_model, inplace=False) quantizer_cnt = self._count_layers(quant_lora_model, FakeQuanterWithAbsMaxObserverLayer) - # 2 LoRA layers (q_proj, v_proj) per transformer layer - self.assertEqual(quantizer_cnt, 2 * self.model.config.num_hidden_layers) + # 1 LoRA layer 
(qkv_proj) per transformer layer + self.assertEqual(quantizer_cnt, self.model.config.num_hidden_layers) def test_forward_no_quant(self): q_config = QuantConfig(activation=None, weight=None) diff --git a/tests/trainer/test_unified_checkpoint.py b/tests/trainer/test_unified_checkpoint.py index 12b63aae806..59c0033f657 100644 --- a/tests/trainer/test_unified_checkpoint.py +++ b/tests/trainer/test_unified_checkpoint.py @@ -66,7 +66,6 @@ "sharding": "", "virtual_pipeline_model_parallel_size": 1, "sequence_parallel": 0, - "fuse_rms_norm": "false", "max_seq_len": 1024, "learning_rate": 3e-04, "min_learning_rate": 1e-05, diff --git a/tests/transformers/auto/test_configuration.py b/tests/transformers/auto/test_configuration.py index a2326b8be22..522a6a35ba2 100644 --- a/tests/transformers/auto/test_configuration.py +++ b/tests/transformers/auto/test_configuration.py @@ -124,8 +124,6 @@ def test_load_from_custom_arch(self): "bos_token_id": 1, "do_normalize": False, "eos_token_id": 2, - "fuse_attention_ffn": False, - "fuse_attention_qkv": False, "fuse_sequence_parallel_allreduce": False, "hidden_act": "silu", "hidden_size": 4096, @@ -151,7 +149,6 @@ def test_load_from_custom_arch(self): "tensor_parallel_output": True, "tie_word_embeddings": False, "transformers_version": "4.28.1", - "fuse_rms_norm": False, "apply_rope_fusion": False, "use_recompute": False, "virtual_pipeline_model_parallel_size": 1, diff --git a/tests/transformers/deepseek_v3/test_modeling.py b/tests/transformers/deepseek_v3/test_modeling.py index a3139d06c11..ce41a1fc499 100644 --- a/tests/transformers/deepseek_v3/test_modeling.py +++ b/tests/transformers/deepseek_v3/test_modeling.py @@ -72,7 +72,7 @@ def __init__( num_labels=3, num_choices=4, pad_token_id=0, - aux_loss_alpha=0.001, + router_aux_loss_coef=0.001, first_k_dense_replace=1, hidden_act="silu", scope=None, @@ -104,7 +104,7 @@ def __init__( self.num_experts_per_tok = num_experts_per_tok self.first_k_dense_replace = first_k_dense_replace self.norm_topk_prob = norm_topk_prob - self.aux_loss_alpha = aux_loss_alpha + self.router_aux_loss_coef = router_aux_loss_coef self.hidden_act = hidden_act self.max_position_embeddings = max_position_embeddings self.initializer_range = initializer_range @@ -166,7 +166,7 @@ def get_config(self) -> DeepseekV3Config: num_experts_per_tok=self.num_experts_per_tok, first_k_dense_replace=self.first_k_dense_replace, norm_topk_prob=self.norm_topk_prob, - aux_loss_alpha=self.aux_loss_alpha, + router_aux_loss_coef=self.router_aux_loss_coef, hidden_act=self.hidden_act, max_position_embeddings=self.max_position_embeddings, initializer_range=self.initializer_range, diff --git a/tests/transformers/ernie4_5/test_modeling.py b/tests/transformers/ernie4_5/test_modeling.py index 6d3c1585cd3..7b14020fe2a 100644 --- a/tests/transformers/ernie4_5/test_modeling.py +++ b/tests/transformers/ernie4_5/test_modeling.py @@ -518,49 +518,3 @@ def test_ernie4_5_converter_from_local_dir(self): rtol=1e-2, ) ) - - # 4. 
forward with fc - from paddleformers.transformers import Ernie4_5Config, Ernie4_5ForCausalLM - - uc_load_model = Ernie4_5ForCausalLM.from_pretrained( - self.torch_model_path, - convert_from_hf=True, - dtype="float32", - load_checkpoint_format="", - ) - fc_load_model = Ernie4_5ForCausalLM.from_pretrained( - self.torch_model_path, dtype="float32", load_checkpoint_format="flex_checkpoint" - ) - uc_load_model.eval() - fc_load_model.eval() - uc_logit = uc_load_model(paddle.to_tensor(input_ids))[0] - fc_logit = fc_load_model(paddle.to_tensor(input_ids))[0] - self.assertTrue( - np.allclose( - uc_logit.detach().cpu().reshape([-1])[:9].astype("float32").numpy(), - fc_logit.detach().cpu().reshape([-1])[:9].float().numpy(), - atol=1e-5, - rtol=1e-5, - ) - ) - - # 5. fuse qkv/ffn with fc - model_config = Ernie4_5Config.from_pretrained(self.torch_model_path) - model_config.fuse_attention_qkv = True - model_config.fuse_attention_ffn = True - fc_fused_load_model = Ernie4_5ForCausalLM.from_pretrained( - self.torch_model_path, - config=model_config, - dtype="float32", - load_checkpoint_format="flex_checkpoint", - ) - fc_fused_load_model.eval() - fc_fused_logit = fc_fused_load_model(paddle.to_tensor(input_ids))[0] - self.assertTrue( - np.allclose( - fc_logit.detach().cpu().reshape([-1])[:9].astype("float32").numpy(), - fc_fused_logit.detach().cpu().reshape([-1])[:9].astype("float32").numpy(), - atol=1e-5, - rtol=1e-5, - ) - ) diff --git a/tests/transformers/gemma3_text/test_modeling.py b/tests/transformers/gemma3_text/test_modeling.py index 5d8735b908b..b774252e49c 100644 --- a/tests/transformers/gemma3_text/test_modeling.py +++ b/tests/transformers/gemma3_text/test_modeling.py @@ -324,8 +324,6 @@ def create_and_check_tp(self, config, input_ids, input_mask, *args): Gemma3ForCausalLM(config) def create_and_check_fuse_attn(self, config, input_ids, input_mask, *args): - config.fuse_attention_qkv = True - config.fuse_attention_ffn = True model = Gemma3ForCausalLM(config) model.eval() diff --git a/tests/transformers/glm4_moe/test_modeling.py b/tests/transformers/glm4_moe/test_modeling.py index 037b592eb66..8baa856a11f 100644 --- a/tests/transformers/glm4_moe/test_modeling.py +++ b/tests/transformers/glm4_moe/test_modeling.py @@ -378,37 +378,27 @@ def test_save_load(self): for model_class in self.all_model_classes: # test from_pretrained model1 = model_class.from_pretrained( - "PaddleFormers/tiny-random-glm4moe", + "PaddleFormers/tiny-random-glm4moe-bf16", download_hub="aistudio", - convert_from_hf=True, - load_checkpoint_format="", + load_checkpoint_format="flex_checkpoint", ) - - model2 = model_class.from_pretrained( - "PaddleFormers/tiny-random-glm4moe", download_hub="aistudio", load_checkpoint_format="flex_checkpoint" - ) - model_state_1 = model1.state_dict() - model_state_2 = model2.state_dict() - - for k, v in model_state_1.items(): - md51 = v._md5sum() - md52 = model_state_2[k]._md5sum() - assert md51 == md52 # test save_pretrained with tempfile.TemporaryDirectory() as tmpdirname: - model2.save_pretrained(tmpdirname, save_checkpoint_format="flex_checkpoint") - model3 = model_class.from_pretrained(tmpdirname, convert_from_hf=True, load_checkpoint_format="") - model_state_3 = model3.state_dict() + model1.save_pretrained(tmpdirname, save_checkpoint_format="flex_checkpoint") + model2 = model_class.from_pretrained( + tmpdirname, convert_from_hf=True, load_checkpoint_format="flex_checkpoint" + ) + model_state_2 = model2.state_dict() - for k, v in model_state_3.items(): - md53 = v._md5sum() - md52 = 
model_state_2[k]._md5sum() + model_state_1 = model1.state_dict() + for k, v in model_state_2.items(): + md52 = v._md5sum() + md51 = model_state_1[k]._md5sum() if k.endswith(".mlp.gate.weight"): + md51 = model_state_1[k].cast("bfloat16")._md5sum() md52 = model_state_2[k].cast("bfloat16")._md5sum() - md53 = model_state_3[k].cast("bfloat16")._md5sum() - assert md52 == md53 + assert md51 == md52 def test_hidden_states_output(self): pass diff --git a/tests/transformers/qwen2/test_modeling.py b/tests/transformers/qwen2/test_modeling.py index 601eef63014..3531a17276c 100644 --- a/tests/transformers/qwen2/test_modeling.py +++ b/tests/transformers/qwen2/test_modeling.py @@ -480,8 +480,6 @@ def test_Qwen2_converter_from_local_dir(self): # 4. fuse qkv/ffn with fc model_config = Qwen2Config.from_pretrained(tempdir) - model_config.fuse_attention_qkv = True - model_config.fuse_attention_ffn = True paddle_model_fused = Qwen2ForCausalLM.from_pretrained( tempdir, config=model_config, diff --git a/tests/transformers/qwen2_5_vl/test_modeling.py b/tests/transformers/qwen2_5_vl/test_modeling.py index 27bfcd87985..11f31754019 100644 --- a/tests/transformers/qwen2_5_vl/test_modeling.py +++ b/tests/transformers/qwen2_5_vl/test_modeling.py @@ -500,38 +500,6 @@ def test_sample_generate(self): else: self.assertTrue(output_generate[0].shape[1] == self.max_new_tokens + inputs_dict["input_ids"].shape[1]) - def test_save_load_flex_checkpoint(self): - for model_class in self.all_model_classes: - with tempfile.TemporaryDirectory() as tmpdirname: - tiny_vision_config = { - "depth": 4, - "intermediate_size": 64, - "hidden_size": 64, - "out_hidden_size": 128, - "fullatt_block_indexes": [1], - } - config = Qwen2_5_VLConfig( - num_hidden_layers=4, - intermediate_size=256, - hidden_size=128, - tie_word_embedding=False, - vision_config=tiny_vision_config, - ) - model = model_class(config) - model.save_pretrained(tmpdirname, save_checkpoint_format="flex_checkpoint") - - model1 = model_class.from_pretrained(tmpdirname, convert_from_hf=True, load_checkpoint_format="") - - model2 = model_class.from_pretrained(tmpdirname, load_checkpoint_format="flex_checkpoint") - - model_state_1 = model1.state_dict() - model_state_2 = model2.state_dict() - - for k, v in model_state_1.items(): - md51 = v._md5sum() - md52 = model_state_2[k]._md5sum() - assert md51 == md52 - class Qwen2_5_VLIntegrationTest(unittest.TestCase): def setUp(self): @@ -956,8 +924,6 @@ def test_Qwen2_5_VL_classes_from_local_dir(self, class_name, pytorch_class_name: paddle_model_fused = paddle_model_class.from_pretrained( tempdir, dtype="float32", - fuse_attention_qkv=True, - fuse_attention_ffn=True, load_checkpoint_format="flex_checkpoint", ).eval() diff --git a/tests/transformers/qwen2moe/test_modeling.py b/tests/transformers/qwen2moe/test_modeling.py index 23692865605..b19f6720ff0 100644 --- a/tests/transformers/qwen2moe/test_modeling.py +++ b/tests/transformers/qwen2moe/test_modeling.py @@ -442,8 +442,6 @@ def test_Qwen2Moe_converter_from_local_dir(self): # 4. fuse qkv/ffn with fc model_config = Qwen2MoeConfig.from_pretrained(tempdir) - model_config.fuse_attention_qkv = True - model_config.fuse_attention_ffn = True paddle_model_fused = Qwen2MoeForCausalLM.from_pretrained( tempdir, config=model_config, diff --git a/tests/transformers/qwen3/test_modeling.py b/tests/transformers/qwen3/test_modeling.py index a644458063c..4c034460375 100644 --- a/tests/transformers/qwen3/test_modeling.py +++ b/tests/transformers/qwen3/test_modeling.py @@ -484,8 +484,6 @@ def test_Qwen3_converter_from_local_dir(self): # 4.
fuse qkv/ffn with fc model_config = Qwen3Config.from_pretrained(tempdir) - model_config.fuse_attention_qkv = True - model_config.fuse_attention_ffn = True paddle_model_fused = Qwen3ForCausalLM.from_pretrained( tempdir, config=model_config, diff --git a/tests/transformers/qwen3_vl/test_modeling.py b/tests/transformers/qwen3_vl/test_modeling.py index b61e3a3b446..bca227b1aee 100644 --- a/tests/transformers/qwen3_vl/test_modeling.py +++ b/tests/transformers/qwen3_vl/test_modeling.py @@ -1003,8 +1003,6 @@ def test_Qwen3VL_classes_from_local_dir(self, class_name, pytorch_class_name: st paddle_model_fused = paddle_model_class.from_pretrained( tempdir, dtype="float32", - fuse_attention_qkv=True, - fuse_attention_ffn=True, load_checkpoint_format="flex_checkpoint", ).eval() diff --git a/tests/transformers/qwen3_vl_moe/test_modeling.py b/tests/transformers/qwen3_vl_moe/test_modeling.py index 0b97867ad7b..339d4c0a19e 100644 --- a/tests/transformers/qwen3_vl_moe/test_modeling.py +++ b/tests/transformers/qwen3_vl_moe/test_modeling.py @@ -1040,8 +1040,6 @@ def test_Qwen3VLMoe_classes_from_local_dir(self, class_name, pytorch_class_name: paddle_model_fused = paddle_model_class.from_pretrained( tempdir, dtype="float32", - fuse_attention_qkv=True, - fuse_attention_ffn=True, load_checkpoint_format="flex_checkpoint", ).eval() diff --git a/tests/transformers/qwen3moe/test_modeling.py b/tests/transformers/qwen3moe/test_modeling.py index 17a5a55263a..6c897d1fb6c 100644 --- a/tests/transformers/qwen3moe/test_modeling.py +++ b/tests/transformers/qwen3moe/test_modeling.py @@ -444,8 +444,6 @@ def test_Qwen3Moe_converter_from_local_dir(self): # 4. fuse qkv/ffn with fc model_config = Qwen3MoeConfig.from_pretrained(tempdir) - model_config.fuse_attention_qkv = True - model_config.fuse_attention_ffn = True paddle_model_fused = Qwen3MoeForCausalLM.from_pretrained( tempdir, config=model_config, diff --git a/tests/transformers/test_configuration_utils.py b/tests/transformers/test_configuration_utils.py index d8f3bebd048..fc0341932ed 100644 --- a/tests/transformers/test_configuration_utils.py +++ b/tests/transformers/test_configuration_utils.py @@ -89,8 +89,6 @@ def test_parse_config_with_single_config(self): def test_model_config_save(self): # 1. 
single config config = FakeSimplePretrainedModelConfig(a=10, b=11, c=12) - config.fuse_attention_qkv = True - config.fuse_rms_norm = True config.tensor_model_parallel_size = 8 config.tensor_parallel_output = True @@ -107,8 +105,6 @@ def test_model_config_save(self): import json loaded_config = json.load(open(os.path.join(tp, "config.json"), "r")) - assert "fuse_attention_qkv" in loaded_config, "fuse qkv is need to save" - assert "fuse_rms_norm" not in loaded_config, "fuse_rms_norm don't need to save" assert "tensor_model_parallel_size" in loaded_config, "tensor_model_parallel_size need to save" assert "paddleformers_version" in loaded_config, "always save paddleformers_version" assert ( diff --git a/tests/transformers/test_conversion_common.py b/tests/transformers/test_conversion_common.py index 018e018ccf6..18c9ce35ad1 100644 --- a/tests/transformers/test_conversion_common.py +++ b/tests/transformers/test_conversion_common.py @@ -17,7 +17,6 @@ import glob import os import tempfile -import unittest import paddle @@ -236,17 +235,3 @@ def forward(self, input_ids): config_fast_ffn.convert_fast_ffn = True common_test_save_and_load(config_no_fast_ffn, config_fast_ffn, TestForCausalLM) - - -from paddleformers.transformers import LlamaConfig, LlamaForCausalLM - - -class TestFuseOrSplit(unittest.TestCase): - def test_model_split_to_fuse(self): - _test_split_to_fuse(LlamaConfig, LlamaForCausalLM) - - def test_model_fuse_to_split(self): - _test_fuse_to_split(LlamaConfig, LlamaForCausalLM) - - def test_model_convert_fast_ffn(self): - _test_fast_ffn() diff --git a/tests/transformers/test_shard_checkpoint.py b/tests/transformers/test_shard_checkpoint.py deleted file mode 100644 index e09bfbe2ad1..00000000000 --- a/tests/transformers/test_shard_checkpoint.py +++ /dev/null @@ -1,486 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json -import os -import tempfile -import unittest - -import paddle - -from paddleformers.transformers import ( - AutoConfig, - AutoModelForCausalLM, - LlamaModel, - PretrainedConfig, - PretrainedModel, - Qwen3Model, - register_base_model, -) -from paddleformers.transformers.model_utils import ( - load_sharded_checkpoint, - shard_checkpoint, -) -from paddleformers.utils.env import ( - PADDLE_WEIGHTS_INDEX_NAME, - PADDLE_WEIGHTS_NAME, - SAFE_WEIGHTS_INDEX_NAME, - SAFE_WEIGHTS_NAME, -) -from paddleformers.utils.import_utils import is_paddle_cuda_available -from tests.testing_utils import require_package - - -class FakeConfig(PretrainedConfig): - def __init__(self, **kwargs): - super().__init__(**kwargs) - - -class FakePretrainedModel(PretrainedModel): - config_class = FakeConfig - - _keep_in_fp32_modules = ["norm."] - - -@register_base_model -class FakeModel(FakePretrainedModel): - def __init__(self, config): - super(FakeModel, self).__init__(config) - self.linear = paddle.nn.Linear(2, 3) - self.norm = paddle.nn.LayerNorm(2) - - -class TestFromPretrained(unittest.TestCase): - def test_from_pretrained_low_cpu_mem_usage_functional(self): - # test that we can use `from_pretrained(..., low_cpu_mem_usage=True)` with normal and - # sharded models - mnames = [ - "Paddleformers/tiny-random-llama3-shard", - "Paddleformers/tiny-random-llama3", - ] - convert_from_hf = [False, True] - for mname, convert in zip(mnames, convert_from_hf): - m1 = LlamaModel.from_pretrained( - mname, - low_cpu_mem_usage=True, - convert_from_hf=convert, - load_checkpoint_format="", - ) - m2 = LlamaModel.from_pretrained( - mname, - low_cpu_mem_usage=False, - convert_from_hf=convert, - load_checkpoint_format="", - ) - for p1, p2 in zip(m1.parameters(), m2.parameters()): - self.assertTrue(paddle.allclose(p1.float(), p2.float())) - - @unittest.skipIf(not is_paddle_cuda_available(), "some op is missing in cpu mode") - def test_keep_in_fp32_modules(self): - with tempfile.TemporaryDirectory() as tempdir: - config = PretrainedConfig() - model = FakeModel.from_config(config, dtype="float16") - model.config = config - model.save_pretrained(tempdir, save_to_hf=False, save_checkpoint_format="") - - # check model_state.pdparams - state_dict = paddle.load(os.path.join(tempdir, "model_state.pdparams")) - - self.assertEqual(state_dict["linear.weight"].dtype, paddle.float16) - self.assertEqual(state_dict["norm.weight"].dtype, paddle.float16) - - new_model = FakeModel.from_pretrained(tempdir, convert_from_hf=False, load_checkpoint_format="") - self.assertEqual(new_model.linear.weight.dtype, paddle.float16) - self.assertEqual(new_model.norm.weight.dtype, paddle.float32) - - def test_load_sharded_checkpoint(self): - config = AutoConfig.from_pretrained("Paddleformers/tiny-random-llama3-shard") - model = LlamaModel.from_pretrained( - "Paddleformers/tiny-random-llama3-shard", - convert_from_hf=False, - load_checkpoint_format="", - ) - - with tempfile.TemporaryDirectory() as tmp_dir: - model.save_pretrained(tmp_dir, max_shard_size="200kiB", save_to_hf=False, save_checkpoint_format="") - model_load = LlamaModel.from_config(config) - missing_keys, unexpected_keys = load_sharded_checkpoint(model_load, tmp_dir) - - self.assertEqual(missing_keys, []) - self.assertEqual(unexpected_keys, []) - for p1, p2 in zip(model.parameters(), model_load.parameters()): - self.assertTrue(paddle.allclose(p1, p2)) - - @unittest.skipIf(not is_paddle_cuda_available(), "some op is missing in cpu mode") - def test_load_from_torch_dtyp_cast(self): - pass - - 
@unittest.skipIf(not is_paddle_cuda_available(), "some op is missing in cpu mode") - def test_load_dtype_cast(self): - dtype_prefix_len = len("paddle.") - - def inner_convert_test(src_dtype, dst_dtype): - str_src_dtype = str(src_dtype)[dtype_prefix_len:] - str_dst_dtype = str(dst_dtype)[dtype_prefix_len:] - - config = AutoConfig.from_pretrained("PaddleFormers/tiny-random-qwen3") - model = Qwen3Model.from_config(config, dtype=str_src_dtype) - - with tempfile.TemporaryDirectory() as tmp_dir: - model.save_pretrained(tmp_dir, save_to_hf=False, save_checkpoint_format="") - new_model = Qwen3Model.from_pretrained( - tmp_dir, dtype=str_dst_dtype, convert_from_hf=False, load_checkpoint_format="" - ) - - for k, v in model.state_dict().items(): - if v.is_floating_point(): - self.assertEqual(v.dtype, src_dtype) - for k, v in new_model.state_dict().items(): - if v.is_floating_point(): - self.assertEqual(v.dtype, dst_dtype) - - with self.subTest("paddle.float32 to paddle.float16"): - inner_convert_test(paddle.float32, paddle.float16) - with self.subTest("paddle.float32 to paddle.bfloat16"): - inner_convert_test(paddle.float32, paddle.bfloat16) - with self.subTest("paddle.float16 to paddle.float32"): - inner_convert_test(paddle.float16, paddle.float32) - with self.subTest("paddle.float16 to paddle.bfloat16"): - inner_convert_test(paddle.float16, paddle.bfloat16) - with self.subTest("paddle.bfloat16 to paddle.float32"): - inner_convert_test(paddle.bfloat16, paddle.float32) - with self.subTest("paddle.bfloat16 to paddle.float16"): - inner_convert_test(paddle.bfloat16, paddle.float16) - - -class TestShardCheckpoint(unittest.TestCase): - def test_shard_checkpoint(self): - # This is the model we will use, total size 340,000 bytes. - model = paddle.nn.Sequential( - paddle.nn.Linear(100, 200, bias_attr=False), # size 80,000 - paddle.nn.Linear(200, 200, bias_attr=False), # size 160,000 - paddle.nn.Linear(200, 100, bias_attr=False), # size 80,000 - paddle.nn.Linear(100, 50, bias_attr=False), # size 20,000 - ) - state_dict = model.state_dict() - - with self.subTest("No shard when max size is bigger than model size"): - shards, index = shard_checkpoint(state_dict) - self.assertIsNone(index) - self.assertDictEqual(shards, {PADDLE_WEIGHTS_NAME: state_dict}) - - with self.subTest("Test sharding, no weights bigger than max size"): - shards, index = shard_checkpoint(state_dict, max_shard_size="300kB") - # Split is first two layers then last two. - self.assertDictEqual( - index, - { - "metadata": {"total_size": 340000}, - "weight_map": { - "0.weight": "model_state-00001-of-00002.pdparams", - "1.weight": "model_state-00001-of-00002.pdparams", - "2.weight": "model_state-00002-of-00002.pdparams", - "3.weight": "model_state-00002-of-00002.pdparams", - }, - }, - ) - - shard1 = {"0.weight": state_dict["0.weight"], "1.weight": state_dict["1.weight"]} - shard2 = {"2.weight": state_dict["2.weight"], "3.weight": state_dict["3.weight"]} - self.assertDictEqual( - shards, {"model_state-00001-of-00002.pdparams": shard1, "model_state-00002-of-00002.pdparams": shard2} - ) - - with self.subTest("Test sharding with weights bigger than max size"): - shards, index = shard_checkpoint(state_dict, max_shard_size="100kB") - # Split is first layer, second layer then last 2. 
- self.assertDictEqual( - index, - { - "metadata": {"total_size": 340000}, - "weight_map": { - "0.weight": "model_state-00001-of-00003.pdparams", - "1.weight": "model_state-00002-of-00003.pdparams", - "2.weight": "model_state-00003-of-00003.pdparams", - "3.weight": "model_state-00003-of-00003.pdparams", - }, - }, - ) - - shard1 = {"0.weight": state_dict["0.weight"]} - shard2 = {"1.weight": state_dict["1.weight"]} - shard3 = {"2.weight": state_dict["2.weight"], "3.weight": state_dict["3.weight"]} - self.assertDictEqual( - shards, - { - "model_state-00001-of-00003.pdparams": shard1, - "model_state-00002-of-00003.pdparams": shard2, - "model_state-00003-of-00003.pdparams": shard3, - }, - ) - - def test_checkpoint_sharding_local(self): - model = LlamaModel.from_pretrained( - "Paddleformers/tiny-random-llama3-shard", - convert_from_hf=False, - load_checkpoint_format="", - ) - - with tempfile.TemporaryDirectory() as tmp_dir: - # We use the same folder for various sizes to make sure a new save erases the old checkpoint. - for max_size in ["50kB", "50kiB", "100kB", "100kiB", "200kB", "200kiB"]: - model.save_pretrained(tmp_dir, max_shard_size=max_size, save_to_hf=False, save_checkpoint_format="") - - # Get each shard file and its size - shard_to_size = {} - for shard in os.listdir(tmp_dir): - if shard.endswith(".pdparams"): - shard_file = os.path.join(tmp_dir, shard) - shard_to_size[shard_file] = os.path.getsize(shard_file) - - index_file = os.path.join(tmp_dir, PADDLE_WEIGHTS_INDEX_NAME) - # Check there is an index but no regular weight file - self.assertTrue(os.path.isfile(index_file)) - self.assertFalse(os.path.isfile(os.path.join(tmp_dir, PADDLE_WEIGHTS_NAME))) - - # Check a file is bigger than max_size only when it has a single weight - for shard_file, size in shard_to_size.items(): - if max_size.endswith("kiB"): - max_size_int = int(max_size[:-3]) * 2**10 - else: - max_size_int = int(max_size[:-2]) * 10**3 - # Note: pickle adds some junk so the weight of the file can end up being slightly bigger than - # the size asked for (since we count parameters) - if size >= max_size_int + 50000: - state_dict = paddle.load(shard_file) - self.assertEqual(len(state_dict), 1) - - # Check the index and the shard files found match - with open(index_file, "r", encoding="utf-8") as f: - index = json.loads(f.read()) - - all_shards = set(index["weight_map"].values()) - shards_found = {f for f in os.listdir(tmp_dir) if f.endswith(".pdparams")} - self.assertSetEqual(all_shards, shards_found) - - # Finally, check the model can be reloaded - new_model = LlamaModel.from_pretrained(tmp_dir, convert_from_hf=False, load_checkpoint_format="") - for p1, p2 in zip(model.parameters(), new_model.parameters()): - self.assertTrue(paddle.allclose(p1, p2)) - - def test_checkpoint_sharding_from_hub(self): - model = LlamaModel.from_pretrained( - "Paddleformers/tiny-random-llama3-shard", - convert_from_hf=False, - load_checkpoint_format="", - ) - - # the model above is the same as the model below, just a sharded version. 
- ref_model = LlamaModel.from_pretrained( - "Paddleformers/tiny-random-llama3-shard", - convert_from_hf=False, - load_checkpoint_format="", - ) - for p1, p2 in zip(model.parameters(), ref_model.parameters()): - self.assertTrue(paddle.allclose(p1, p2)) - - def test_checkpoint_variant_local(self): - model = AutoModelForCausalLM.from_pretrained( - "PaddleFormers/tiny-random-qwen3", convert_from_hf=True, load_checkpoint_format="" - ) - - with tempfile.TemporaryDirectory() as tmp_dir: - model.save_pretrained(tmp_dir, variant="v2", save_to_hf=False, save_checkpoint_format="") - - weights_name = ".".join(PADDLE_WEIGHTS_NAME.split(".")[:-1] + ["v2"] + ["pdparams"]) - - weights_file = os.path.join(tmp_dir, weights_name) - self.assertTrue(os.path.isfile(weights_file)) - self.assertFalse(os.path.isfile(os.path.join(tmp_dir, PADDLE_WEIGHTS_NAME))) - - with self.assertRaises(EnvironmentError): - _ = Qwen3Model.from_pretrained(tmp_dir, convert_from_hf=False, load_checkpoint_format="") - - new_model = Qwen3Model.from_pretrained( - tmp_dir, variant="v2", convert_from_hf=False, load_checkpoint_format="" - ) - - for p1, p2 in zip(model.parameters(), new_model.parameters()): - self.assertTrue(paddle.allclose(p1, p2)) - - def test_checkpoint_variant_local_sharded(self): - model = AutoModelForCausalLM.from_pretrained( - "PaddleFormers/tiny-random-qwen3", convert_from_hf=True, load_checkpoint_format="" - ) - - with tempfile.TemporaryDirectory() as tmp_dir: - model.save_pretrained( - tmp_dir, variant="v2", max_shard_size="50kB", save_to_hf=False, save_checkpoint_format="" - ) - - weights_index_name = ".".join(PADDLE_WEIGHTS_INDEX_NAME.split(".")[:-1] + ["v2"] + ["json"]) - weights_index_file = os.path.join(tmp_dir, weights_index_name) - self.assertTrue(os.path.isfile(weights_index_file)) - self.assertFalse(os.path.isfile(os.path.join(tmp_dir, PADDLE_WEIGHTS_INDEX_NAME))) - - for i in range(1, 10): - weights_name = ".".join(PADDLE_WEIGHTS_NAME.split(".")[:-1] + [f"v2-0000{i}-of-00020"] + ["pdparams"]) - weights_name_file = os.path.join(tmp_dir, weights_name) - self.assertTrue(os.path.isfile(weights_name_file)) - - for i in range(10, 21): - weights_name = ".".join(PADDLE_WEIGHTS_NAME.split(".")[:-1] + [f"v2-000{i}-of-00020"] + ["pdparams"]) - weights_name_file = os.path.join(tmp_dir, weights_name) - self.assertTrue(os.path.isfile(weights_name_file)) - - with self.assertRaises(EnvironmentError): - _ = Qwen3Model.from_pretrained(tmp_dir, convert_from_hf=False, load_checkpoint_format="") - - new_model = Qwen3Model.from_pretrained( - tmp_dir, variant="v2", convert_from_hf=False, load_checkpoint_format="" - ) - - for p1, p2 in zip(model.parameters(), new_model.parameters()): - self.assertTrue(paddle.allclose(p1, p2)) - - @require_package("safetensors") - def test_checkpoint_variant_local_safe(self): - model = AutoModelForCausalLM.from_pretrained( - "PaddleFormers/tiny-random-qwen3", convert_from_hf=True, load_checkpoint_format="" - ) - - with tempfile.TemporaryDirectory() as tmp_dir: - model.save_pretrained( - tmp_dir, variant="v2", safe_serialization=True, save_to_hf=False, save_checkpoint_format="" - ) - - weights_name = ".".join(SAFE_WEIGHTS_NAME.split(".")[:-1] + ["v2"] + ["safetensors"]) - - weights_file = os.path.join(tmp_dir, weights_name) - - self.assertTrue(os.path.isfile(weights_file)) - self.assertFalse(os.path.isfile(os.path.join(tmp_dir, SAFE_WEIGHTS_NAME))) - - with self.assertRaises(EnvironmentError): - _ = Qwen3Model.from_pretrained(tmp_dir, convert_from_hf=False, load_checkpoint_format="") - - 
-             new_model = Qwen3Model.from_pretrained(
-                 tmp_dir, variant="v2", convert_from_hf=False, load_checkpoint_format=""
-             )
-
-             for p1, p2 in zip(model.parameters(), new_model.parameters()):
-                 self.assertTrue(paddle.allclose(p1, p2))
-
-     @require_package("safetensors")
-     def test_checkpoint_variant_local_sharded_safe(self):
-         model = AutoModelForCausalLM.from_pretrained(
-             "PaddleFormers/tiny-random-qwen3", convert_from_hf=True, load_checkpoint_format=""
-         )
-
-         with tempfile.TemporaryDirectory() as tmp_dir:
-             model.save_pretrained(
-                 tmp_dir,
-                 variant="v2",
-                 max_shard_size="50kB",
-                 safe_serialization=True,
-                 save_to_hf=False,
-                 save_checkpoint_format="",
-             )
-
-             weights_index_name = ".".join(SAFE_WEIGHTS_INDEX_NAME.split(".")[:-1] + ["v2"] + ["json"])
-             weights_index_file = os.path.join(tmp_dir, weights_index_name)
-             self.assertTrue(os.path.isfile(weights_index_file))
-             self.assertFalse(os.path.isfile(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME)))
-
-             for i in range(1, 10):
-                 weights_name = ".".join(SAFE_WEIGHTS_NAME.split(".")[:-1] + [f"v2-0000{i}-of-00020"] + ["safetensors"])
-                 weights_name_file = os.path.join(tmp_dir, weights_name)
-                 self.assertTrue(os.path.isfile(weights_name_file))
-
-             for i in range(10, 21):
-                 weights_name = ".".join(SAFE_WEIGHTS_NAME.split(".")[:-1] + [f"v2-000{i}-of-00020"] + ["safetensors"])
-                 weights_name_file = os.path.join(tmp_dir, weights_name)
-                 self.assertTrue(os.path.isfile(weights_name_file))
-
-             with self.assertRaises(EnvironmentError):
-                 _ = Qwen3Model.from_pretrained(tmp_dir, convert_from_hf=False, load_checkpoint_format="")
-
-             new_model = Qwen3Model.from_pretrained(
-                 tmp_dir, variant="v2", convert_from_hf=False, load_checkpoint_format=""
-             )
-
-             for p1, p2 in zip(model.parameters(), new_model.parameters()):
-                 self.assertTrue(paddle.allclose(p1, p2))
-
-     def test_checkpoint_variant_hub(self):
-         with tempfile.TemporaryDirectory() as tmp_dir:
-             with self.assertRaises(EnvironmentError):
-                 _ = LlamaModel.from_pretrained(
-                     "Paddleformers/tiny-random-llama-variant",
-                     cache_dir=tmp_dir,
-                     convert_from_hf=False,
-                     load_checkpoint_format="",
-                 )
-
-             model = LlamaModel.from_pretrained(
-                 "Paddleformers/tiny-random-llama-variant",
-                 cache_dir=tmp_dir,
-                 variant="v2",
-                 convert_from_hf=False,
-                 load_checkpoint_format="",
-             )
-             self.assertIsNotNone(model)
-
-     def test_checkpoint_variant_hub_sharded(self):
-         with tempfile.TemporaryDirectory() as tmp_dir:
-             with self.assertRaises(EnvironmentError):
-                 _ = LlamaModel.from_pretrained(
-                     "Paddleformers/tiny-random-llama-variant-sharded",
-                     cache_dir=tmp_dir,
-                     convert_from_hf=False,
-                     load_checkpoint_format="",
-                 )
-             model = LlamaModel.from_pretrained(
-                 "Paddleformers/tiny-random-llama-variant-sharded",
-                 cache_dir=tmp_dir,
-                 variant="v2",
-                 convert_from_hf=False,
-                 load_checkpoint_format="",
-             )
-             self.assertIsNotNone(model)
-
-     def test_checkpoint_variant_save_load(self):
-         with tempfile.TemporaryDirectory() as tmp_dir:
-             model = LlamaModel.from_pretrained(
-                 "Paddleformers/tiny-random-llama-variant",
-                 cache_dir=tmp_dir,
-                 variant="v2",
-                 convert_from_hf=False,
-                 load_checkpoint_format="",
-             )
-             weights_name = ".".join(PADDLE_WEIGHTS_NAME.split(".")[:-1] + ["v2"] + ["pdparams"])
-
-             model.save_pretrained(tmp_dir, variant="v2", save_to_hf=False, save_checkpoint_format="")
-             # saving will create a variant checkpoint
-             self.assertTrue(os.path.isfile(os.path.join(tmp_dir, weights_name)))
-
-             model.save_pretrained(tmp_dir, save_to_hf=False, save_checkpoint_format="")
-             # saving shouldn't delete variant checkpoints
-             weights_name = ".".join(PADDLE_WEIGHTS_NAME.split(".")[:-1] + ["v2"] + ["pdparams"])
-             self.assertTrue(os.path.isfile(os.path.join(tmp_dir, weights_name)))
-
-             # there should be a normal checkpoint
-             self.assertTrue(os.path.isfile(os.path.join(tmp_dir, PADDLE_WEIGHTS_NAME)))
-
-         self.assertIsNotNone(model)