Source code for sagemaker.hyperpod.cluster_management.hp_cluster_stack

  1import importlib.resources
  2import json
  3import logging
  4import uuid
  5from pydantic import Field, field_validator
  6from typing import Optional, List, Dict, Any, Union
  7import ast
  8import boto3
  9import click
 10import yaml
 11from hyperpod_cluster_stack_template.v1_0.model import ClusterStackBase
 12
 13from sagemaker.hyperpod import create_boto3_client
 14from sagemaker.hyperpod.common.telemetry import _hyperpod_telemetry_emitter
 15from sagemaker.hyperpod.common.telemetry.constants import Feature
 16
 17CAPABILITIES_FOR_STACK_CREATION = [
 18    'CAPABILITY_AUTO_EXPAND',
 19    'CAPABILITY_IAM',
 20    'CAPABILITY_NAMED_IAM'
 21]
 22log = logging.getLogger()
 23
 24
class HpClusterStack(ClusterStackBase):
    """Manages SageMaker HyperPod cluster CloudFormation stacks.

    This class provides functionality to create, manage, and monitor CloudFormation stacks
    for SageMaker HyperPod clusters. It extends ClusterStackBase with stack lifecycle operations.

    .. dropdown:: Usage Examples
       :open:

       .. code-block:: python

          >>> # Create a cluster stack instance
          >>> stack = HpClusterStack()
          >>> response = stack.create(region="us-west-2")
          >>>
          >>> # Check stack status
          >>> status = stack.get_status()
          >>> print(status)
    """
    stack_id: Optional[str] = Field(
        None,
        description="CloudFormation stack ID set after stack creation"
    )
    stack_name: Optional[str] = Field(
        None,
        description="CloudFormation stack name set after stack creation"
    )

    def __init__(self, **data):
        super().__init__(**data)

    @staticmethod
    def get_template() -> str:
        """Return the packaged creation template as a JSON string.

        Reads ``creation_template.yaml`` from the
        ``hyperpod_cluster_stack_template`` package, parses it with
        ``yaml.safe_load`` and serialises the result as indented JSON.

        **Returns:**

        str: The template rendered as JSON (2-space indent, non-ASCII preserved)

        **Raises:**

        RuntimeError: When the template cannot be read, parsed or serialised
        """
        try:
            template_content = importlib.resources.read_text(
                'hyperpod_cluster_stack_template',
                'creation_template.yaml'
            )
            yaml_data = yaml.safe_load(template_content)
            return json.dumps(yaml_data, indent=2, ensure_ascii=False)
        except Exception as e:
            # Chain explicitly so the underlying failure stays attached as __cause__.
            raise RuntimeError(f"Failed to load template from package: {e}") from e

    @_hyperpod_telemetry_emitter(Feature.HYPERPOD, "create_cluster_stack")
    def create(self,
               region: Optional[str] = None,
               template_version: Optional[int] = 1) -> Dict[str, Any]:
        """Creates a new HyperPod cluster CloudFormation stack.

        **Parameters:**

        .. list-table::
           :header-rows: 1
           :widths: 20 20 60

           * - Parameter
             - Type
             - Description
           * - region
             - str, optional
             - AWS region for stack creation. Uses current session region if not specified
           * - template_version
             - int, optional
             - Version prefix of the template key inside the setup bucket. Defaults to 1; ``None`` is treated as 1

        **Returns:**

        dict: CloudFormation describe_stacks response containing stack details

        **Raises:**

        Exception: When CloudFormation stack creation fails

        .. dropdown:: Usage Examples
           :open:

           .. code-block:: python

              >>> # Create stack in default region
              >>> stack = HpClusterStack()
              >>> response = stack.create()
              >>>
              >>> # Create stack in specific region
              >>> response = stack.create(region="us-east-1")
        """
        # Get the region from the boto3 session or use the provided region
        region = region or boto3.session.Session().region_name
        cf = create_boto3_client('cloudformation', region_name=region)

        # The parameter is Optional; without this guard an explicit None would
        # be interpolated into the template key as the literal text "None".
        if template_version is None:
            template_version = 1

        # Convert the input object to CloudFormation parameters
        parameters = self._create_parameters()

        # Short random suffix keeps repeated creations from colliding on name.
        stack_name = f"HyperpodClusterStack-{str(uuid.uuid4())[:5]}"
        # Use the fixed bucket name from the model
        bucket_name = "aws-sagemaker-hyperpod-cluster-setup"
        template_key = f"{template_version}/templates/main-stack-eks-based-template.yaml"

        try:
            # Use TemplateURL for large templates (>51KB)
            template_url = f"https://{bucket_name}-{region}-{self.stage}.s3.amazonaws.com/{template_key}"
            response = cf.create_stack(
                StackName=stack_name,
                TemplateURL=template_url,
                Parameters=parameters,
                Tags=self._parse_tags(),
                Capabilities=CAPABILITIES_FOR_STACK_CREATION
            )

            log.info(f"Stack creation initiated. Stack ID: {response['StackId']}")
            click.secho(f"Stack creation initiated. Stack ID: {response['StackId']}")

            self.stack_id = response['StackId']
            # Setting the stack name here to avoid calling multiple cloud formation APIs again
            self.stack_name = stack_name

            return self.describe(stack_name, region)
        except Exception as e:
            log.error(f"Error creating stack: {e}")
            raise

    def _numbered_setting_parameters(self, value: Any, key_prefix: str) -> List[Dict[str, str]]:
        """Expand a settings array into numbered CloudFormation parameters.

        ``value`` may already be a list or may be a JSON string encoding one.
        Each element becomes one parameter named ``f"{key_prefix}{i}"`` with
        ``i`` starting at 1. Nested keys are converted to PascalCase first.
        Dict/list elements are serialised as a single-element JSON array;
        anything else is passed through ``str()``.
        """
        if isinstance(value, list):
            settings_list = value
        else:
            # Parse JSON string to list
            try:
                settings_list = json.loads(str(value))
            except (json.JSONDecodeError, TypeError):
                # Best effort: an unparsable value contributes no parameters,
                # but leave a trace so the omission is diagnosable.
                log.warning("Ignoring %s value that is not valid JSON", key_prefix)
                settings_list = []

        numbered = []
        for i, setting in enumerate(settings_list, 1):
            formatted_setting = self._convert_nested_keys(setting)
            if isinstance(formatted_setting, (dict, list)):
                parameter_value = "[" + json.dumps(formatted_setting) + "]"
            else:
                parameter_value = str(formatted_setting)
            numbered.append({
                'ParameterKey': f'{key_prefix}{i}',
                'ParameterValue': parameter_value
            })
        return numbered

    def _create_parameters(self) -> List[Dict[str, str]]:
        """Build the CloudFormation ``Parameters`` list from the model fields.

        Iterates the fields declared on ``ClusterStackBase`` and skips any whose
        value is ``None``. ``instance_group_settings`` and ``rig_settings`` are
        expanded into numbered parameters; the listed ID fields are flattened
        to comma-separated strings; a ``tags`` list is JSON-encoded; booleans
        are lower-cased. Every other value is passed through ``str()``.
        """
        # Fields whose list form must be sent as a comma-separated string.
        comma_separated_fields = (
            'availability_zone_ids', 'nat_gateway_ids', 'eks_private_subnet_ids',
            'security_group_ids', 'private_route_table_ids', 'private_subnet_ids',
        )
        # Array fields that fan out into <Prefix>1, <Prefix>2, ... parameters.
        numbered_fields = {
            'instance_group_settings': 'InstanceGroupSettings',
            'rig_settings': 'RigSettings',
        }

        parameters = []
        for field_name in ClusterStackBase.model_fields:
            value = getattr(self, field_name, None)
            if value is None:
                continue

            if field_name in numbered_fields:
                parameters.extend(
                    self._numbered_setting_parameters(value, numbered_fields[field_name])
                )
                continue

            if field_name in comma_separated_fields:
                if isinstance(value, list):
                    value = ','.join(str(item) for item in value)
                elif isinstance(value, str) and value.startswith('['):
                    # Handle JSON string format from CLI
                    try:
                        parsed_list = json.loads(value)
                        value = ','.join(str(item) for item in parsed_list)
                    except (json.JSONDecodeError, TypeError):
                        pass  # Keep original string value
            elif field_name == 'tags':
                # Convert tags array to JSON string; a string is kept as is
                if isinstance(value, list):
                    value = json.dumps(value)
            elif isinstance(value, bool):
                # Convert boolean values to strings for CloudFormation
                value = str(value).lower()

            parameters.append({
                'ParameterKey': self._snake_to_pascal(field_name),
                'ParameterValue': str(value)
            })
        return parameters

    def _parse_tags(self) -> List[Dict[str, str]]:
        """Parse tags field and return proper CloudFormation tags format.

        Accepts a list or a JSON string encoding one. A list whose first
        element is a dict containing ``'Key'`` is returned unchanged; otherwise
        each string element becomes ``{'Key': element, 'Value': ''}`` and
        non-string elements are dropped. Empty, unparsable or non-list input
        yields an empty list.
        """
        if not self.tags:
            return []

        tags_list = self.tags
        if isinstance(self.tags, str):
            try:
                tags_list = json.loads(self.tags)
            except (json.JSONDecodeError, TypeError):
                return []

        # Convert array of strings to Key-Value format
        if isinstance(tags_list, list) and tags_list:
            # Check if already in Key-Value format
            if isinstance(tags_list[0], dict) and 'Key' in tags_list[0]:
                return tags_list
            # Convert string array to Key-Value format
            return [{'Key': tag, 'Value': ''} for tag in tags_list if isinstance(tag, str)]

        return []

    def _convert_nested_keys(self, obj: Any) -> Any:
        """Convert nested JSON keys from snake_case to PascalCase.

        Recurses through dicts and lists; any other value is returned as is.
        """
        if isinstance(obj, dict):
            return {self._snake_to_pascal(k): self._convert_nested_keys(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [self._convert_nested_keys(item) for item in obj]
        return obj

    @staticmethod
    def _snake_to_pascal(snake_str: str) -> str:
        """Convert snake_case string to PascalCase.

        Names listed in the override table are returned exactly as mapped
        (acronym casing such as ``EKS``/``VPC``/``IAM``, and names that are
        already PascalCase and must not be re-capitalised). Everything else is
        split on ``_`` and each part capitalised. Empty input is returned as is.
        """
        if not snake_str:
            return snake_str

        # Handle specific cases
        mappings = {
            "eks_cluster_name": "EKSClusterName",
            "create_eks_cluster_stack": "CreateEKSClusterStack",
            "create_hyperpod_cluster_stack": "CreateHyperPodClusterStack",
            "create_sagemaker_iam_role_stack": "CreateSageMakerIAMRoleStack",
            "create_vpc_stack": "CreateVPCStack",
            "sagemaker_iam_role_name": "SageMakerIAMRoleName",
            "vpc_cidr": "VpcCIDR",
            "enable_hp_inference_feature": "EnableHPInferenceFeature",
            "fsx_availability_zone_id": "FsxAvailabilityZoneId",
            "hyperpod_cluster_name": "HyperPodClusterName",
            "InstanceCount": "InstanceCount",
            "InstanceGroupName": "InstanceGroupName",
            "InstanceType": "InstanceType",
            "TargetAvailabilityZoneId": "TargetAvailabilityZoneId",
            "ThreadsPerCore": "ThreadsPerCore",
            "InstanceStorageConfigs": "InstanceStorageConfigs",
            "EbsVolumeConfig": "EbsVolumeConfig",
            "VolumeSizeInGB": "VolumeSizeInGB"
        }

        if snake_str in mappings:
            return mappings[snake_str]

        # Default case: capitalize each word
        return ''.join(word.capitalize() for word in snake_str.split('_'))

    def _snake_to_camel(self, snake_str: str) -> str:
        """Convert snake_case string to camelCase for nested JSON keys."""
        if not snake_str:
            return snake_str
        words = snake_str.split('_')
        return words[0] + ''.join(word.capitalize() for word in words[1:])

    @staticmethod
    @_hyperpod_telemetry_emitter(Feature.HYPERPOD, "describe_cluster_stack")
    def describe(stack_name, region: Optional[str] = None):
        """Describes a CloudFormation stack by name.

        .. note::
           Stack descriptions are region-specific. You must use the correct region where the stack was created to retrieve its description.

        **Parameters:**

        .. list-table::
           :header-rows: 1
           :widths: 20 20 60

           * - Parameter
             - Type
             - Description
           * - stack_name
             - str
             - Name of the CloudFormation stack to describe. For ARN format arn:aws:cloudformation:region:account:stack/stack-name/stack-id, use the stack-name part
           * - region
             - str, optional
             - AWS region where the stack exists

        **Returns:**

        dict: CloudFormation describe_stacks response

        **Raises:**

        ValueError: When stack is not accessible or doesn't exist
        RuntimeError: When CloudFormation operation fails

        .. dropdown:: Usage Examples
           :open:

           .. code-block:: python

              >>> # Describe a stack by name
              >>> response = HpClusterStack.describe("my-stack-name")
              >>>
              >>> # Describe stack in specific region
              >>> response = HpClusterStack.describe("my-stack", region="us-west-2")
        """
        cf = create_boto3_client('cloudformation', region_name=region)

        try:
            return cf.describe_stacks(StackName=stack_name)
        except cf.exceptions.ClientError as e:
            error_code = e.response['Error']['Code']

            log.debug(f"CloudFormation error: {error_code} for operation on stack")

            if error_code in ['ValidationError', 'AccessDenied']:
                log.error("Stack operation failed - check stack name and permissions")
                raise ValueError("Stack not accessible") from e
            log.error("CloudFormation operation failed")
            raise RuntimeError("Stack operation failed") from e
        except Exception as e:
            log.error("Unexpected error during stack operation")
            raise RuntimeError("Stack operation failed") from e

    @staticmethod
    @_hyperpod_telemetry_emitter(Feature.HYPERPOD, "list_cluster_stack")
    def list(region: Optional[str] = None, stack_status_filter: Optional[List[str]] = None):
        """Lists all CloudFormation stacks in the specified region.

        .. note::
           Stack listings are region-specific. If no region is provided, uses the default region from your AWS configuration.

        **Parameters:**

        .. list-table::
           :header-rows: 1
           :widths: 20 20 60

           * - Parameter
             - Type
             - Description
           * - region
             - str, optional
             - AWS region to list stacks from. Uses default region if not specified
           * - stack_status_filter
             - List[str], optional
             - Stack statuses to include. When omitted, every status except DELETE_COMPLETE is requested

        **Returns:**

        dict: ``{'StackSummaries': [...]}`` with the stack summaries from all result pages

        **Raises:**

        ValueError: When insufficient permissions to list stacks
        RuntimeError: When CloudFormation list operation fails

        .. dropdown:: Usage Examples
           :open:

           .. code-block:: python

              >>> # List stacks in current region
              >>> stacks = HpClusterStack.list()
              >>>
              >>> # List stacks in specific region
              >>> stacks = HpClusterStack.list(region="us-east-1")
        """
        cf = create_boto3_client('cloudformation', region_name=region)

        # All valid stack statuses except DELETE_COMPLETE, used to avoid paginating
        # through tens of thousands of deleted stacks which causes throttling.
        _ACTIVE_STACK_STATUSES = [
            'CREATE_IN_PROGRESS', 'CREATE_FAILED', 'CREATE_COMPLETE',
            'ROLLBACK_IN_PROGRESS', 'ROLLBACK_FAILED', 'ROLLBACK_COMPLETE',
            'DELETE_IN_PROGRESS', 'DELETE_FAILED',
            'UPDATE_IN_PROGRESS', 'UPDATE_COMPLETE_CLEANUP_IN_PROGRESS',
            'UPDATE_COMPLETE', 'UPDATE_FAILED',
            'UPDATE_ROLLBACK_IN_PROGRESS', 'UPDATE_ROLLBACK_FAILED',
            'UPDATE_ROLLBACK_COMPLETE_CLEANUP_IN_PROGRESS', 'UPDATE_ROLLBACK_COMPLETE',
            'REVIEW_IN_PROGRESS', 'IMPORT_IN_PROGRESS', 'IMPORT_COMPLETE',
            'IMPORT_ROLLBACK_IN_PROGRESS', 'IMPORT_ROLLBACK_FAILED', 'IMPORT_ROLLBACK_COMPLETE',
        ]

        try:
            # Prepare API call parameters. An explicit filter (even an empty
            # list) is honoured; only None selects the active-status default.
            list_params = {}
            if stack_status_filter is not None:
                list_params['StackStatusFilter'] = stack_status_filter
            else:
                # Exclude DELETE_COMPLETE at the API level to avoid paginating through
                # large numbers of deleted stacks, which causes throttling errors.
                list_params['StackStatusFilter'] = _ACTIVE_STACK_STATUSES

            response = cf.list_stacks(**list_params)

            # Paginate through all results
            all_summaries = response.get('StackSummaries', [])
            while 'NextToken' in response:
                list_params['NextToken'] = response['NextToken']
                response = cf.list_stacks(**list_params)
                all_summaries.extend(response.get('StackSummaries', []))

            return {'StackSummaries': all_summaries}
        except cf.exceptions.ClientError as e:
            error_code = e.response['Error']['Code']

            log.debug(f"CloudFormation error: {error_code} for list stacks operation")

            if error_code == 'AccessDenied':
                log.error("List stacks operation failed - check permissions")
                raise ValueError("Insufficient permissions to list stacks") from e
            log.error("CloudFormation list operation failed")
            raise RuntimeError("List stacks operation failed") from e
        except Exception as e:
            log.error("Unexpected error during list stacks operation")
            raise RuntimeError("List stacks operation failed") from e

    @staticmethod
    def _get_stack_status_helper(stack_name: str, region: Optional[str] = None):
        """Helper method to get stack status for any stack identifier.

        Returns the ``StackStatus`` of the first stack in the describe
        response, or ``None`` (after printing a notice) when the response
        contains no stacks. Errors raised by ``describe`` propagate.
        """
        log.debug(f"Getting status for stack: {stack_name}")
        stack_description = HpClusterStack.describe(stack_name, region)

        if stack_description.get('Stacks'):
            status = stack_description['Stacks'][0].get('StackStatus')
            log.debug(f"Stack {stack_name} status: {status}")
            return status

        log.debug(f"Stack {stack_name} not found")
        click.secho(f"Stack {stack_name} not found")
        return None

    def get_status(self, region: Optional[str] = None):
        """Gets the status of the current stack instance.

        **Parameters:**

        .. list-table::
           :header-rows: 1
           :widths: 20 20 60

           * - Parameter
             - Type
             - Description
           * - region
             - str, optional
             - AWS region where the stack exists

        **Returns:**

        str: CloudFormation stack status (e.g., 'CREATE_COMPLETE', 'UPDATE_IN_PROGRESS')

        **Raises:**

        ValueError: When stack hasn't been created yet (call create() first)

        .. dropdown:: Usage Examples
           :open:

           .. code-block:: python

              >>> # Create stack first, then check status
              >>> stack = HpClusterStack()
              >>> stack.create()
              >>> status = stack.get_status()
              >>> print(f"Stack status: {status}")
        """
        if not self.stack_name:
            raise ValueError("Stack must be created first. Call create() before checking status.")
        return self._get_stack_status_helper(self.stack_name, region)

    @staticmethod
    def check_status(stack_name: str, region: Optional[str] = None):
        """Checks the status of any CloudFormation stack by name.

        **Parameters:**

        .. list-table::
           :header-rows: 1
           :widths: 20 20 60

           * - Parameter
             - Type
             - Description
           * - stack_name
             - str
             - Name of the CloudFormation stack
           * - region
             - str, optional
             - AWS region where the stack exists

        **Returns:**

        str: CloudFormation stack status or None if stack not found

        .. dropdown:: Usage Examples
           :open:

           .. code-block:: python

              >>> # Check status of any stack
              >>> status = HpClusterStack.check_status("my-stack-name")
              >>>
              >>> # Check status in specific region
              >>> status = HpClusterStack.check_status("my-stack", region="us-west-2")
        """
        return HpClusterStack._get_stack_status_helper(stack_name, region)

    @staticmethod
    def delete(stack_name: str, region: Optional[str] = None, retain_resources: Optional[List[str]] = None,
               logger: Optional[logging.Logger] = None) -> None:
        """Deletes a HyperPod cluster CloudFormation stack.

        Removes the specified CloudFormation stack and all associated AWS resources.
        This operation cannot be undone and proceeds automatically without confirmation.

        **Parameters:**

        .. list-table::
           :header-rows: 1
           :widths: 20 20 60

           * - Parameter
             - Type
             - Description
           * - stack_name
             - str
             - Name of the CloudFormation stack to delete
           * - region
             - str, optional
             - AWS region where the stack exists
           * - retain_resources
             - List[str], optional
             - List of logical resource IDs to retain during deletion (only works on DELETE_FAILED stacks)
           * - logger
             - logging.Logger, optional
             - Logger instance for output messages. Uses default logger if not provided

        **Raises:**

        ValueError: When stack doesn't exist or retain_resources limitation is encountered
        RuntimeError: When CloudFormation deletion fails

        .. dropdown:: Usage Examples
           :open:

           .. code-block:: python

              >>> # Delete a stack (automatically proceeds without confirmation)
              >>> HpClusterStack.delete("my-stack-name")
              >>>
              >>> # Delete in specific region
              >>> HpClusterStack.delete("my-stack-name", region="us-west-2")
              >>>
              >>> # Delete with retained resources (only works on DELETE_FAILED stacks)
              >>> HpClusterStack.delete("my-stack-name", retain_resources=["S3Bucket", "EFSFileSystem"])
              >>>
              >>> # Delete with custom logger
              >>> import logging
              >>> logger = logging.getLogger(__name__)
              >>> HpClusterStack.delete("my-stack-name", logger=logger)
        """
        # Imported at call time rather than at module import time.
        from sagemaker.hyperpod.cli.cluster_stack_utils import (
            delete_stack_with_confirmation,
            StackNotFoundError
        )

        if logger is None:
            logger = logging.getLogger(__name__)

        # Convert retain_resources list to comma-separated string for the utility function
        retain_resources_str = ",".join(retain_resources) if retain_resources else ""

        def sdk_confirm_callback(message: str) -> bool:
            """SDK-specific confirmation callback - always auto-confirms."""
            logger.info(f"Auto-confirming: {message}")
            return True

        try:
            delete_stack_with_confirmation(
                stack_name=stack_name,
                region=region or boto3.session.Session().region_name,
                retain_resources_str=retain_resources_str,
                message_callback=logger.info,
                confirm_callback=sdk_confirm_callback,
                success_callback=logger.info
            )
        except StackNotFoundError as e:
            error_msg = f"Stack '{stack_name}' not found"
            logger.error(error_msg)
            raise ValueError(error_msg) from e
        except Exception as e:
            error_str = str(e)

            # Handle CloudFormation retain-resources limitation with clear exception for SDK
            if retain_resources and "specify which resources to retain only when the stack is in the DELETE_FAILED state" in error_str:
                error_msg = (
                    "CloudFormation limitation: retain_resources can only be used on stacks in DELETE_FAILED state. "
                    "Current stack state allows normal deletion. Try deleting without retain_resources first, "
                    "then retry with retain_resources if deletion fails."
                )
                logger.error(error_msg)
                raise ValueError(error_msg) from e

            # Handle termination protection
            if "TerminationProtection is enabled" in error_str:
                error_msg = (
                    "Stack deletion blocked: Termination Protection is enabled. "
                    "Disable termination protection first using AWS CLI or Console."
                )
                logger.error(error_msg)
                raise RuntimeError(error_msg) from e

            # Handle other errors
            logger.error(f"Failed to delete stack: {error_str}")
            raise RuntimeError(f"Stack deletion failed: {error_str}") from e
def _yaml_to_json_string(yaml_path) -> str:
    """Convert a YAML file to a JSON string.

    Args:
        yaml_path: Path of the YAML file to read.

    Returns:
        The parsed document serialised as JSON with a 2-space indent and
        non-ASCII characters preserved.
    """
    # Decode explicitly as UTF-8: the output keeps non-ASCII characters
    # verbatim, so the input must not depend on the platform's locale encoding.
    with open(yaml_path, 'r', encoding='utf-8') as file:
        yaml_data = yaml.safe_load(file)
    return json.dumps(yaml_data, indent=2, ensure_ascii=False)