Skip to content

Commit 3427e70

Browse files
committed
[DevSettings] Add support for selective ExtraChefAttributes updates.
Accept updates for the attribute `cluster/slurm/reconfigure_timeout`.
1 parent 8617a06 commit 3427e70

File tree

9 files changed

+649
-11
lines changed

9 files changed

+649
-11
lines changed

cli/src/pcluster/config/update_policy.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
SLURM,
2121
STORAGE_TYPES_SUPPORTING_LIVE_UPDATES,
2222
)
23+
from pcluster.utils import get_dictionary_diff, parse_json_string
2324

2425

2526
class UpdatePolicy:
@@ -510,6 +511,38 @@ def get_pool_name_from_change_paths(change):
510511
return ""
511512

512513

514+
# ExtraChefAttributes update policy helpers
515+
# Define which paths within ExtraChefAttributes JSON are updatable (dot-notation)
516+
# IMPORTANT: We assume this set to contain the path to leaf fields.
517+
EXTRA_CHEF_ATTRIBUTES_UPDATABLE_PATHS: set[str] = {
518+
"cluster.slurm.reconfigure_timeout",
519+
}
520+
521+
522+
def _get_non_updatable_changes(old_value: str, new_value: str) -> list[str]:
523+
"""Parse and compare two ExtraChefAttributes JSON strings, returning paths that are not updatable."""
524+
old_attrs = parse_json_string(old_value, raise_on_error=False, default={}) if old_value else {}
525+
new_attrs = parse_json_string(new_value, raise_on_error=False, default={}) if new_value else {}
526+
527+
return [p for p in get_dictionary_diff(old_attrs, new_attrs) if p not in EXTRA_CHEF_ATTRIBUTES_UPDATABLE_PATHS]
528+
529+
530+
def condition_checker_extra_chef_attributes(change, _) -> bool:
531+
"""
532+
Check if ExtraChefAttributes changes are allowed.
533+
534+
Only changes to paths defined in EXTRA_CHEF_ATTRIBUTES_UPDATABLE_PATHS are allowed.
535+
"""
536+
return len(_get_non_updatable_changes(change.old_value, change.new_value)) == 0
537+
538+
539+
def fail_reason_extra_chef_attributes(change, _) -> str:
540+
"""Generate fail reason for ExtraChefAttributes update."""
541+
non_updatable_changes = _get_non_updatable_changes(change.old_value, change.new_value)
542+
paths_str = ", ".join(sorted(non_updatable_changes))
543+
return f"The following ExtraChefAttributes fields cannot be updated: {paths_str}"
544+
545+
513546
# Common fail_reason messages
514547
UpdatePolicy.FAIL_REASONS = {
515548
"ebs_volume_resize": "Updating the file system after a resize operation requires commands specific to your "
@@ -526,6 +559,7 @@ def get_pool_name_from_change_paths(change):
526559
"compute_or_login_nodes_running": lambda change, patch: (
527560
"The update is not supported when compute or login nodes are running"
528561
),
562+
"extra_chef_attributes_update": fail_reason_extra_chef_attributes,
529563
}
530564

531565
# Common action_needed messages
@@ -548,6 +582,9 @@ def get_pool_name_from_change_paths(change):
548582
"Stop the login nodes by setting Count parameter to 0 "
549583
"and update the cluster with the pcluster update-cluster command"
550584
),
585+
"extra_chef_attributes_update": lambda change, patch: (
586+
"Revert the non-updatable ExtraChefAttributes fields to their original values."
587+
),
551588
}
552589

553590
# Base policies
@@ -567,6 +604,15 @@ def get_pool_name_from_change_paths(change):
567604
name="SUPPORTED", level=0, fail_reason="-", condition_checker=(lambda change, patch: True)
568605
)
569606

607+
# Update policy for ExtraChefAttributes - allows updates to specific fields only
608+
UpdatePolicy.EXTRA_CHEF_ATTRIBUTES = UpdatePolicy(
609+
name="EXTRA_CHEF_ATTRIBUTES",
610+
level=5,
611+
fail_reason=UpdatePolicy.FAIL_REASONS["extra_chef_attributes_update"],
612+
action_needed=UpdatePolicy.ACTIONS_NEEDED["extra_chef_attributes_update"],
613+
condition_checker=condition_checker_extra_chef_attributes,
614+
)
615+
570616
# Checks resize of max_vcpus in Batch Compute Environment
571617
UpdatePolicy.AWSBATCH_CE_MAX_RESIZE = UpdatePolicy(
572618
name="AWSBATCH_CE_MAX_RESIZE",

cli/src/pcluster/schemas/common_schema.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -216,13 +216,7 @@ class CookbookSchema(BaseSchema):
216216
)
217217
}
218218
)
219-
extra_chef_attributes = fields.Str(
220-
metadata={
221-
"update_policy": UpdatePolicy(
222-
UpdatePolicy.UNSUPPORTED, fail_reason=UpdatePolicy.FAIL_REASONS["cookbook_update"]
223-
)
224-
}
225-
)
219+
extra_chef_attributes = fields.Str(metadata={"update_policy": UpdatePolicy.EXTRA_CHEF_ATTRIBUTES})
226220

227221
@post_load()
228222
def make_resource(self, data, **kwargs):
@@ -275,7 +269,7 @@ def make_resource(self, data, **kwargs):
275269
class BaseDevSettingsSchema(BaseSchema):
276270
"""Represent the common schema of Dev Setting for ImageBuilder and Cluster."""
277271

278-
cookbook = fields.Nested(CookbookSchema, metadata={"update_policy": UpdatePolicy.UNSUPPORTED})
272+
cookbook = fields.Nested(CookbookSchema, metadata={"update_policy": UpdatePolicy.IGNORED})
279273
node_package = fields.Str(metadata={"update_policy": UpdatePolicy.UNSUPPORTED})
280274
aws_batch_cli_package = fields.Str(metadata={"update_policy": UpdatePolicy.UNSUPPORTED})
281275

cli/src/pcluster/utils.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -614,3 +614,79 @@ def get_needed_ultraserver_capacity_block_statuses(statuses, capacity_reservatio
614614
):
615615
needed_capacity_block_statuses.append(status)
616616
return needed_capacity_block_statuses
617+
618+
619+
def parse_json_string(value: str, raise_on_error: bool = False, default: any = None) -> any:
620+
"""
621+
Parse a JSON string into a Python object.
622+
623+
:param value: JSON string to parse
624+
:param raise_on_error: If True, raises exception on parse failure; if False, returns default
625+
:param default: Value to return on parse failure when raise_on_error is False
626+
:return: Parsed JSON object or default value on failure
627+
"""
628+
try:
629+
return json.loads(value)
630+
except (json.JSONDecodeError, TypeError) as e:
631+
LOGGER.error("Failed to parse JSON string: %s. Error: %s", value, e)
632+
if raise_on_error:
633+
raise e
634+
return default
635+
636+
637+
def _collect_leaves(d: dict, prefix: str) -> set[str]:
638+
"""Collect all leaf paths from a dict."""
639+
leaves = set()
640+
for k, v in d.items():
641+
key_path = f"{prefix}.{k}" if prefix else str(k)
642+
if isinstance(v, dict) and v:
643+
leaves.update(_collect_leaves(v, key_path))
644+
else:
645+
leaves.add(key_path)
646+
return leaves if leaves else {prefix} if prefix else set()
647+
648+
649+
def _collect_diff_for_key(d: dict, key: str, path: str) -> set[str]:
650+
"""Collect leaf paths for a key that exists only in one dictionary."""
651+
key_path = f"{path}.{key}" if path else str(key)
652+
if isinstance(d[key], dict) and d[key]:
653+
return _collect_leaves(d[key], key_path)
654+
return {key_path}
655+
656+
657+
def get_dictionary_diff(d1: dict, d2: dict, path: str = "") -> set[str]:
658+
"""
659+
Recursively find all leaf paths where two dictionaries differ.
660+
661+
Returns a set of dot-notation paths (e.g., "cluster.settings.enabled") where
662+
the dictionaries have different values, including added or removed keys.
663+
Always returns leaf paths, never intermediate dict paths.
664+
665+
:param d1: First dictionary to compare (if None, treated as empty dict)
666+
:param d2: Second dictionary to compare (if None, treated as empty dict)
667+
:param path: Current path prefix (used for recursion)
668+
:return: Set of dot-notation paths where dictionaries differ.
669+
If no differences detected return empoty set.
670+
"""
671+
d1 = d1 or {}
672+
d2 = d2 or {}
673+
674+
diffs: set[str] = set()
675+
keys1, keys2 = set(d1.keys()), set(d2.keys())
676+
677+
# Collect paths that are added or remove
678+
for k in keys1 - keys2:
679+
diffs.update(_collect_diff_for_key(d1, k, path))
680+
681+
for k in keys2 - keys1:
682+
diffs.update(_collect_diff_for_key(d2, k, path))
683+
684+
# Compare paths
685+
for k in keys1 & keys2:
686+
key_path = f"{path}.{k}" if path else str(k)
687+
if isinstance(d1[k], dict) and isinstance(d2[k], dict):
688+
diffs.update(get_dictionary_diff(d1[k], d2[k], key_path))
689+
elif d1[k] != d2[k]:
690+
diffs.add(key_path)
691+
692+
return diffs

cli/src/pcluster/validators/dev_settings_validators.py

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515

1616
EXTRA_CHEF_ATTRIBUTES_PATH = "DevSettings/Cookbook/ExtraChefAttributes"
1717
ATTR_IN_PLACE_UPDATE_ON_FLEET_ENABLED = "in_place_update_on_fleet_enabled"
18+
ATTR_RECONFIGURE_TIMEOUT = "cluster.slurm.reconfigure_timeout"
19+
MIN_SLURM_RECONFIGURE_TIMEOUT = 300
1820

1921

2022
class ExtraChefAttributesValidator(Validator):
@@ -29,8 +31,10 @@ def _validate(self, extra_chef_attributes: str = None):
2931
"""
3032
if not extra_chef_attributes:
3133
return
32-
else:
33-
self._validate_in_place_update_on_fleet_enabled(json.loads(extra_chef_attributes))
34+
35+
attrs = json.loads(extra_chef_attributes)
36+
self._validate_in_place_update_on_fleet_enabled(attrs)
37+
self._validate_slurm_reconfigure_timeout(attrs)
3438

3539
def _validate_in_place_update_on_fleet_enabled(self, extra_chef_attributes: dict = None):
3640
"""Validate attribute cluster.in_place_update_on_fleet_enabled.
@@ -60,3 +64,33 @@ def _validate_in_place_update_on_fleet_enabled(self, extra_chef_attributes: dict
6064
"by replacing compute and login nodes according to the selected QueueUpdateStrategy.",
6165
FailureLevel.WARNING,
6266
)
67+
68+
def _validate_slurm_reconfigure_timeout(self, extra_chef_attributes: dict = None):
69+
"""Validate attribute cluster.slurm.reconfigure-timeout.
70+
71+
Must be an integer greater than 300.
72+
73+
Args:
74+
extra_chef_attributes: Dictionary of Chef attributes to validate.
75+
"""
76+
reconfigure_timeout = dig(extra_chef_attributes, *ATTR_RECONFIGURE_TIMEOUT.split("."))
77+
78+
if reconfigure_timeout is None:
79+
return
80+
81+
# Reject booleans explicitly (bool is subclass of int in Python)
82+
if isinstance(reconfigure_timeout, bool) or not isinstance(reconfigure_timeout, int):
83+
self._add_failure(
84+
f"Invalid value in {EXTRA_CHEF_ATTRIBUTES_PATH}: "
85+
f"attribute '{ATTR_RECONFIGURE_TIMEOUT}' must be an integer.",
86+
FailureLevel.ERROR,
87+
)
88+
return
89+
90+
if reconfigure_timeout <= MIN_SLURM_RECONFIGURE_TIMEOUT:
91+
self._add_failure(
92+
f"Invalid value in {EXTRA_CHEF_ATTRIBUTES_PATH}: "
93+
f"attribute '{ATTR_RECONFIGURE_TIMEOUT}' "
94+
f"must be greater than {MIN_SLURM_RECONFIGURE_TIMEOUT}.",
95+
FailureLevel.ERROR,
96+
)

0 commit comments

Comments
 (0)