Skip to content

C4 dataset download exceeds the Hugging Face API rate limit #2239

@zixianwang2022

Description

@zixianwang2022

Bug description

I am trying to launch a 48-node training run, but the ranks are stuck downloading the C4 dataset because of a rate limit from Hugging Face. Is there any way to get around this? I am using pp24, ep16, dp-ep1.

Rate limited. Waiting 196.0s before retry [Retry 1/5].
/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/torch/distributed/device_mesh.py:604: UserWarning: Slicing a flattened dim from root mesh will be deprecated in PT 2.11. Users need to bookkeep the flattened mesh directly. 
  sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)
/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/torch/distributed/device_mesh.py:604: UserWarning: Slicing a flattened dim from root mesh will be deprecated in PT 2.11. Users need to bookkeep the flattened mesh directly. 
  sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)
HTTP Error 429 thrown while requesting GET https://huggingface.co/api/datasets/allenai/c4/tree/1588ec454efa1a09f29cd18ddd04fe05fc8653a2/en?expand=false&recursive=true&limit=1000&cursor=ZXlKbWFXeGxYMjVoYldVaU9pSmxiaTlqTkMxMGNtRnBiaTR3TURrNU9TMXZaaTB3TVRBeU5DNXFjMjl1TG1kNklpd2lkSEpsWlY5dmFXUWlPaUprWXprME0yTTBZelF3WmpVelpEQXlZak14WTJWa01XUmxabUUzWlRWbU5ETTRaRFU0TmpKbEluMD06MTAwMA%3D%3D
Rate limited. Waiting 196.0s before retry [Retry 1/5].
[rank356]: Traceback (most recent call last):
[rank356]:   File "<frozen runpy>", line 198, in _run_module_as_main
[rank356]:   File "<frozen runpy>", line 88, in _run_code
[rank356]:   File "/lustre/orion/gen150/scratch/zixianw4/torchtitan/torchtitan/train.py", line 768, in <module>
[rank356]:     main(Trainer)
[rank356]:   File "/lustre/orion/gen150/scratch/zixianw4/torchtitan/torchtitan/train.py", line 736, in main
[rank356]:     trainer = trainer_class(config)
[rank356]:               ^^^^^^^^^^^^^^^^^^^^^
[rank356]:   File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 362, in wrapper
[rank356]:     return f(*args, **kwargs)
[rank356]:            ^^^^^^^^^^^^^^^^^^
[rank356]:   File "/lustre/orion/gen150/scratch/zixianw4/torchtitan/torchtitan/train.py", line 127, in __init__
[rank356]:     self.dataloader = self.train_spec.build_dataloader_fn(
[rank356]:                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank356]:   File "/lustre/orion/gen150/scratch/zixianw4/torchtitan/torchtitan/hf_datasets/text_datasets.py", line 181, in build_text_dataloader
[rank356]:     hf_ds = HuggingFaceTextDataset(
[rank356]:             ^^^^^^^^^^^^^^^^^^^^^^^
[rank356]:   File "/lustre/orion/gen150/scratch/zixianw4/torchtitan/torchtitan/hf_datasets/text_datasets.py", line 87, in __init__
[rank356]:     ds = dataset_loader(path)
[rank356]:          ^^^^^^^^^^^^^^^^^^^^
[rank356]:   File "/lustre/orion/gen150/scratch/zixianw4/torchtitan/torchtitan/hf_datasets/text_datasets.py", line 26, in _load_c4_dataset
[rank356]:     return load_dataset(dataset_path, name="en", split=split, streaming=True)
[rank356]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank356]:   File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/datasets/load.py", line 1397, in load_dataset
[rank356]:     builder_instance = load_dataset_builder(
[rank356]:                        ^^^^^^^^^^^^^^^^^^^^^
[rank356]:   File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/datasets/load.py", line 1137, in load_dataset_builder
[rank356]:     dataset_module = dataset_module_factory(
[rank356]:                      ^^^^^^^^^^^^^^^^^^^^^^^
[rank356]:   File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/datasets/load.py", line 1036, in dataset_module_factory
[rank356]:     raise e1 from None
[rank356]:   File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/datasets/load.py", line 1009, in dataset_module_factory
[rank356]:     ).get_module()
[rank356]:       ^^^^^^^^^^^^
[rank356]:   File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/datasets/load.py", line 633, in get_module
[rank356]:     data_files = DataFilesDict.from_patterns(
[rank356]:                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank356]:   File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/datasets/data_files.py", line 705, in from_patterns
[rank356]:     else DataFilesList.from_patterns(
[rank356]:          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank356]:   File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/datasets/data_files.py", line 598, in from_patterns
[rank356]:     resolve_pattern(
[rank356]:   File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/datasets/data_files.py", line 364, in resolve_pattern
[rank356]:     for filepath, info in fs.glob(pattern, detail=True, **glob_kwargs).items()
[rank356]:                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank356]:   File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/huggingface_hub/hf_file_system.py", line 614, in glob
[rank356]:     return super().glob(path, maxdepth=maxdepth, **kwargs)
[rank356]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank356]:   File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/fsspec/spec.py", line 642, in glob
[rank356]:     allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs)
[rank356]:                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank356]:   File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/huggingface_hub/hf_file_system.py", line 670, in find
[rank356]:     path_info = self.info(path, revision=resolved_path.revision, **kwargs)
[rank356]:                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank356]:   File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/huggingface_hub/hf_file_system.py", line 801, in info
[rank356]:     self.ls(parent_path)
[rank356]:   File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/huggingface_hub/hf_file_system.py", line 452, in ls
[rank356]:     out = self._ls_tree(path, refresh=refresh, revision=revision, **kwargs)
[rank356]:           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank356]:   File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/huggingface_hub/hf_file_system.py", line 557, in _ls_tree
[rank356]:     for path_info in tree:
[rank356]:   File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/huggingface_hub/hf_api.py", line 3156, in list_repo_tree
[rank356]:     for path_info in paginate(path=tree_url, headers=headers, params={"recursive": recursive, "expand": expand}):
[rank356]:   File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/huggingface_hub/utils/_pagination.py", line 37, in paginate
[rank356]:     hf_raise_for_status(r)
[rank356]:   File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/huggingface_hub/utils/_http.py", line 743, in hf_raise_for_status
[rank356]:     raise _format(HfHubHTTPError, message, response) from e
[rank356]: huggingface_hub.errors.HfHubHTTPError: (Request ID: Root=1-696940cb-4d8687522d2e15255beae225;86d7d923-d927-48ba-a0ed-fcd6323e126e)

Versions

I am using PyTorch built for ROCm 7.0.2.

toml:

[job]
dump_folder = "./outputs/titan_173B_titan_N48_PP24_EP16_MBS1_NBS30"
description = "DeepSeek-V3 173B_titan training"
print_config = false

[profiling]
enable_profiling = false
save_traces_folder = "profile_trace"
profile_freq = 10
enable_memory_snapshot = false
save_memory_snapshot_folder = "memory_snapshot"

[metrics]
log_freq = 1
disable_color_printing = false
enable_tensorboard = false
save_tb_folder = "tb"
enable_wandb = false

[model]
name = "deepseek_v3"
flavor = "173B_titan"
hf_assets_path = "./assets/hf/deepseek-moe-16b-base"

[optimizer]
name = "AdamW"
lr = 2.2e-4
eps = 1e-8

[lr_scheduler]
warmup_steps = 200
decay_ratio = 0.8
decay_type = "cosine"
min_lr_factor = 0.1

[training]
local_batch_size = 30
#local_batch_size = pipeline_parallel_microbatch_size * num_batches 
global_batch_size = 480
seq_len = 4096
max_norm = 1.0
steps = 5
dataset = "c4"

[parallelism]
# DP Settings (Fixed)
data_parallel_replicate_degree = 1
data_parallel_shard_degree = -1
fsdp_reshard_after_forward = "default"

# Variable Settings
tensor_parallel_degree = 1
enable_async_tensor_parallel = false
pipeline_parallel_degree = 24
pipeline_parallel_schedule = "1F1B"
pipeline_parallel_microbatch_size = 1 
expert_parallel_degree = 16
expert_tensor_parallel_degree = 1

[checkpoint]
enable = false
folder = "checkpoint"
interval = 1000
last_save_model_only = true
export_dtype = "float32"
async_mode = "disabled"

[activation_checkpoint]
mode = "full"
selective_ac_option = 'op'

[compile]
enable=true
components = ["loss"]

[quantize.linear.float8]
enable_fsdp_float8_all_gather = false
precompute_float8_dynamic_scale_for_fsdp = false
filter_fqns = ["output", "router.gate"]

[quantize.grouped_mm.float8]
fqns = ["experts"]

Metadata

Metadata

Assignees

No one assigned

    Labels

    question: Further information is requested

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions