-
Notifications
You must be signed in to change notification settings - Fork 680
Open
Labels
question — Further information is requested
Description
Bug description
I am trying to launch a 48-node training run, but it is stuck downloading the C4 dataset due to a download rate limit from Hugging Face. Is there any way to get around this? I am using pp24, ep16, dp-ep1.
Rate limited. Waiting 196.0s before retry [Retry 1/5].
/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/torch/distributed/device_mesh.py:604: UserWarning: Slicing a flattened dim from root mesh will be deprecated in PT 2.11. Users need to bookkeep the flattened mesh directly.
sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)
/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/torch/distributed/device_mesh.py:604: UserWarning: Slicing a flattened dim from root mesh will be deprecated in PT 2.11. Users need to bookkeep the flattened mesh directly.
sliced_mesh_layout = self._get_slice_mesh_layout(mesh_dim_names)
HTTP Error 429 thrown while requesting GET https://huggingface.co/api/datasets/allenai/c4/tree/1588ec454efa1a09f29cd18ddd04fe05fc8653a2/en?expand=false&recursive=true&limit=1000&cursor=ZXlKbWFXeGxYMjVoYldVaU9pSmxiaTlqTkMxMGNtRnBiaTR3TURrNU9TMXZaaTB3TVRBeU5DNXFjMjl1TG1kNklpd2lkSEpsWlY5dmFXUWlPaUprWXprME0yTTBZelF3WmpVelpEQXlZak14WTJWa01XUmxabUUzWlRWbU5ETTRaRFU0TmpKbEluMD06MTAwMA%3D%3D
Rate limited. Waiting 196.0s before retry [Retry 1/5].
[rank356]: Traceback (most recent call last):
[rank356]: File "<frozen runpy>", line 198, in _run_module_as_main
[rank356]: File "<frozen runpy>", line 88, in _run_code
[rank356]: File "/lustre/orion/gen150/scratch/zixianw4/torchtitan/torchtitan/train.py", line 768, in <module>
[rank356]: main(Trainer)
[rank356]: File "/lustre/orion/gen150/scratch/zixianw4/torchtitan/torchtitan/train.py", line 736, in main
[rank356]: trainer = trainer_class(config)
[rank356]: ^^^^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 362, in wrapper
[rank356]: return f(*args, **kwargs)
[rank356]: ^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/gen150/scratch/zixianw4/torchtitan/torchtitan/train.py", line 127, in __init__
[rank356]: self.dataloader = self.train_spec.build_dataloader_fn(
[rank356]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/gen150/scratch/zixianw4/torchtitan/torchtitan/hf_datasets/text_datasets.py", line 181, in build_text_dataloader
[rank356]: hf_ds = HuggingFaceTextDataset(
[rank356]: ^^^^^^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/gen150/scratch/zixianw4/torchtitan/torchtitan/hf_datasets/text_datasets.py", line 87, in __init__
[rank356]: ds = dataset_loader(path)
[rank356]: ^^^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/gen150/scratch/zixianw4/torchtitan/torchtitan/hf_datasets/text_datasets.py", line 26, in _load_c4_dataset
[rank356]: return load_dataset(dataset_path, name="en", split=split, streaming=True)
[rank356]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/datasets/load.py", line 1397, in load_dataset
[rank356]: builder_instance = load_dataset_builder(
[rank356]: ^^^^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/datasets/load.py", line 1137, in load_dataset_builder
[rank356]: dataset_module = dataset_module_factory(
[rank356]: ^^^^^^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/datasets/load.py", line 1036, in dataset_module_factory
[rank356]: raise e1 from None
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/datasets/load.py", line 1009, in dataset_module_factory
[rank356]: ).get_module()
[rank356]: ^^^^^^^^^^^^
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/datasets/load.py", line 633, in get_module
[rank356]: data_files = DataFilesDict.from_patterns(
[rank356]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/datasets/data_files.py", line 705, in from_patterns
[rank356]: else DataFilesList.from_patterns(
[rank356]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/datasets/data_files.py", line 598, in from_patterns
[rank356]: resolve_pattern(
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/datasets/data_files.py", line 364, in resolve_pattern
[rank356]: for filepath, info in fs.glob(pattern, detail=True, **glob_kwargs).items()
[rank356]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/huggingface_hub/hf_file_system.py", line 614, in glob
[rank356]: return super().glob(path, maxdepth=maxdepth, **kwargs)
[rank356]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/fsspec/spec.py", line 642, in glob
[rank356]: allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs)
[rank356]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/huggingface_hub/hf_file_system.py", line 670, in find
[rank356]: path_info = self.info(path, revision=resolved_path.revision, **kwargs)
[rank356]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/huggingface_hub/hf_file_system.py", line 801, in info
[rank356]: self.ls(parent_path)
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/huggingface_hub/hf_file_system.py", line 452, in ls
[rank356]: out = self._ls_tree(path, refresh=refresh, revision=revision, **kwargs)
[rank356]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/huggingface_hub/hf_file_system.py", line 557, in _ls_tree
[rank356]: for path_info in tree:
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/huggingface_hub/hf_api.py", line 3156, in list_repo_tree
[rank356]: for path_info in paginate(path=tree_url, headers=headers, params={"recursive": recursive, "expand": expand}):
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/huggingface_hub/utils/_pagination.py", line 37, in paginate
[rank356]: hf_raise_for_status(r)
[rank356]: File "/lustre/orion/world-shared/gen150/zixianw4/envs/TORCH-ROCM7.0.2_env/lib/python3.11/site-packages/huggingface_hub/utils/_http.py", line 743, in hf_raise_for_status
[rank356]: raise _format(HfHubHTTPError, message, response) from e
[rank356]: huggingface_hub.errors.HfHubHTTPError: (Request ID: Root=1-696940cb-4d8687522d2e15255beae225;86d7d923-d927-48ba-a0ed-fcd6323e126e)
Versions
I am using ROCm torch 7.0.2.
toml:
[job]
dump_folder = "./outputs/titan_173B_titan_N48_PP24_EP16_MBS1_NBS30"
description = "DeepSeek-V3 173B_titan training"
print_config = false
[profiling]
enable_profiling = false
save_traces_folder = "profile_trace"
profile_freq = 10
enable_memory_snapshot = false
save_memory_snapshot_folder = "memory_snapshot"
[metrics]
log_freq = 1
disable_color_printing = false
enable_tensorboard = false
save_tb_folder = "tb"
enable_wandb = false
[model]
name = "deepseek_v3"
flavor = "173B_titan"
hf_assets_path = "./assets/hf/deepseek-moe-16b-base"
[optimizer]
name = "AdamW"
lr = 2.2e-4
eps = 1e-8
[lr_scheduler]
warmup_steps = 200
decay_ratio = 0.8
decay_type = "cosine"
min_lr_factor = 0.1
[training]
local_batch_size = 30
#local_batch_size = pipeline_parallel_microbatch_size * num_batches
global_batch_size = 480
seq_len = 4096
max_norm = 1.0
steps = 5
dataset = "c4"
[parallelism]
# DP Settings (Fixed)
data_parallel_replicate_degree = 1
data_parallel_shard_degree = -1
fsdp_reshard_after_forward = "default"
# Variable Settings
tensor_parallel_degree = 1
enable_async_tensor_parallel = false
pipeline_parallel_degree = 24
pipeline_parallel_schedule = "1F1B"
pipeline_parallel_microbatch_size = 1
expert_parallel_degree = 16
expert_tensor_parallel_degree = 1
[checkpoint]
enable = false
folder = "checkpoint"
interval = 1000
last_save_model_only = true
export_dtype = "float32"
async_mode = "disabled"
[activation_checkpoint]
mode = "full"
selective_ac_option = 'op'
[compile]
enable=true
components = ["loss"]
[quantize.linear.float8]
enable_fsdp_float8_all_gather = false
precompute_float8_dynamic_scale_for_fsdp = false
filter_fqns = ["output", "router.gate"]
[quantize.grouped_mm.float8]
fqns = ["experts"]
Metadata
Metadata
Assignees
Labels
question — Further information is requested