-
Notifications
You must be signed in to change notification settings - Fork 680
Open
Labels
question — Further information is requested
Description
Bug description
I am trying to launch a 48-node training run, but it is stuck downloading the C4 dataset due to a download rate limit from Hugging Face. Is there any way to get around this? I am using pp24, ep16, dp-ep1.
Rate limited. Waiting 196.0s before retry [Retry 1/5].
/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/torch/distributed/device_mesh.py:604: UserWarning: Slicing a flattened dim from root mesh will be deprecated in PT 2.11. Users need to bookkeep the flattened mesh directly.
sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)
/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/torch/distributed/device_mesh.py:604: UserWarning: Slicing a flattened dim from root mesh will be deprecated in PT 2.11. Users need to bookkeep the flattened mesh directly.
sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)
HTTP Error 429 thrown while requesting GET https://huggingface.co/api/datasets/allenai/c4/tree/1588ec454efa1a09f29cd18ddd04fe05fc8653a2/en?expand=false&recursive=true&limit=1000&cursor=ZXlKbWFXeGxYMjVoYldVaU9pSmxiaTlqTkMxMGNtRnBiaTR3TURrNU9TMXZaaTB3TVRBeU5DNXFjMjl1TG1kNklpd2lkSEpsWlY5dmFXUWlPaUprWXprME0yTTBZelF3WmpVelpEQXlZak14WTJWa01XUmxabUUzWlRWbU5ETTRaRFU0TmpKbEluMD06MTAwMA%3D%3D
Rate limited. Waiting 196.0s before retry [Retry 1/5].
[rank356]: Traceback (most recent call last):
[rank356]: File "<frozen runpy>", line 198, in _run_module_as_main
[rank356]: File "<frozen runpy>", line 88, in _run_code
[rank356]: File "/lustre/orion/gen150/scratch/zixianw4/torchtitan/torchtitan/train.py", line 768, in <module>
[rank356]: main(Trainer)
[rank356]: File "/lustre/orion/gen150/scratch/zixianw4/torchtitan/torchtitan/train.py", line 736, in main
[rank356]: trainer = trainer_class(config)
[rank356]: ^^^^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 362, in wrapper
[rank356]: return f(*args, **kwargs)
[rank356]: ^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/gen150/scratch/zixianw4/torchtitan/torchtitan/train.py", line 127, in __init__
[rank356]: self.dataloader = self.train_spec.build_dataloader_fn(
[rank356]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/gen150/scratch/zixianw4/torchtitan/torchtitan/hf_datasets/text_datasets.py", line 181, in build_text_dataloader
[rank356]: hf_ds = HuggingFaceTextDataset(
[rank356]: ^^^^^^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/gen150/scratch/zixianw4/torchtitan/torchtitan/hf_datasets/text_datasets.py", line 87, in __init__
[rank356]: ds = dataset_loader(path)
[rank356]: ^^^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/gen150/scratch/zixianw4/torchtitan/torchtitan/hf_datasets/text_datasets.py", line 26, in _load_c4_dataset
[rank356]: return load_dataset(dataset_path, name="en", split=split, streaming=True)
[rank356]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/datasets/load.py", line 1397, in load_dataset
[rank356]: builder_instance = load_dataset_builder(
[rank356]: ^^^^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/datasets/load.py", line 1137, in load_dataset_builder
[rank356]: dataset_module = dataset_module_factory(
[rank356]: ^^^^^^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/datasets/load.py", line 1036, in dataset_module_factory
[rank356]: raise e1 from None
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/datasets/load.py", line 1009, in dataset_module_factory
[rank356]: ).get_module()
[rank356]: ^^^^^^^^^^^^
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/datasets/load.py", line 633, in get_module
[rank356]: data_files = DataFilesDict.from_patterns(
[rank356]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/datasets/data_files.py", line 705, in from_patterns
[rank356]: else DataFilesList.from_patterns(
[rank356]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/datasets/data_files.py", line 598, in from_patterns
[rank356]: resolve_pattern(
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/datasets/data_files.py", line 364, in resolve_pattern
[rank356]: for filepath, info in fs.glob(pattern, detail=True, **glob_kwargs).items()
[rank356]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/huggingface_hub/hf_file_system.py", line 614, in glob
[rank356]: return super().glob(path, maxdepth=maxdepth, **kwargs)
[rank356]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/fsspec/spec.py", line 642, in glob
[rank356]: allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs)
[rank356]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/huggingface_hub/hf_file_system.py", line 670, in find
[rank356]: path_info = self.info(path, revision=resolved_path.revision, **kwargs)
[rank356]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/huggingface_hub/hf_file_system.py", line 801, in info
[rank356]: self.ls(parent_path)
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/huggingface_hub/hf_file_system.py", line 452, in ls
[rank356]: out = self._ls_tree(path, refresh=refresh, revision=revision, **kwargs)
[rank356]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/huggingface_hub/hf_file_system.py", line 557, in _ls_tree
[rank356]: for path_info in tree:
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/huggingface_hub/hf_api.py", line 3156, in list_repo_tree
[rank356]: for path_info in paginate(path=tree_url, headers=headers, params={"recursive": recursive, "expand": expand}):
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/huggingface_hub/utils/_pagination.py", line 37, in paginate
[rank356]: hf_raise_for_status(r)
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/huggingface_hub/utils/_http.py", line 743, in hf_raise_for_status
[rank356]: raise _format(HfHubHTTPError, message, response) from e
[rank356]: huggingface_hub.errors.HfHubHTTPError: (Request ID: Root=1-696940cb-4d8687522d2e15255beae225;86d7d923-d927-48ba-a0ed-fcd6323e126e)
Versions
I am using ROCm torch 7.0.2.
toml:
[job]
dump_folder = "./outputs/titan_173B_titan_N48_PP24_EP16_MBS1_NBS30"
description = "DeepSeek-V3 173B_titan training"
print_config = false
[profiling]
enable_profiling = false
save_traces_folder = "profile_trace"
profile_freq = 10
enable_memory_snapshot = false
save_memory_snapshot_folder = "memory_snapshot"
[metrics]
log_freq = 1
disable_color_printing = false
enable_tensorboard = false
save_tb_folder = "tb"
enable_wandb = false
[model]
name = "deepseek_v3"
flavor = "173B_titan"
hf_assets_path = "./assets/hf/deepseek-moe-16b-base"
[optimizer]
name = "AdamW"
lr = 2.2e-4
eps = 1e-8
[lr_scheduler]
warmup_steps = 200
decay_ratio = 0.8
decay_type = "cosine"
min_lr_factor = 0.1
[training]
local_batch_size = 30
#local_batch_size = pipeline_parallel_microbatch_size * num_batches
global_batch_size = 480
seq_len = 4096
max_norm = 1.0
steps = 5
dataset = "c4"
[parallelism]
# DP Settings (Fixed)
data_parallel_replicate_degree = 1
data_parallel_shard_degree = -1
fsdp_reshard_after_forward = "default"
# Variable Settings
tensor_parallel_degree = 1
enable_async_tensor_parallel = false
pipeline_parallel_degree = 24
pipeline_parallel_schedule = "1F1B"
pipeline_parallel_microbatch_size = 1
expert_parallel_degree = 16
expert_tensor_parallel_degree = 1
[checkpoint]
enable = false
folder = "checkpoint"
interval = 1000
last_save_model_only = true
export_dtype = "float32"
async_mode = "disabled"
[activation_checkpoint]
mode = "full"
selective_ac_option = 'op'
[compile]
enable=true
components = ["loss"]
[quantize.linear.float8]
enable_fsdp_float8_all_gather = false
precompute_float8_dynamic_scale_for_fsdp = false
filter_fqns = ["output", "router.gate"]
[quantize.grouped_mm.float8]
fqns = ["experts"]
Metadata
Metadata
Assignees
Labels
question — Further information is requested