From df2d5cf6740642dbc991a8b0f8eceafd1b5f5cac Mon Sep 17 00:00:00 2001 From: Devesh Dama Date: Mon, 9 Mar 2026 08:29:57 +0000 Subject: [PATCH] {AKS} Add provisioningState retry logic with two-phase check validation Add opt-in retry logic to the Azure CLI test SDK for handling provisioningState race conditions caused by external modifications (e.g., Azure Policy) during live tests. When enabled via AZURE_CLI_TEST_RETRY_PROVISIONING_CHECK=true, the test framework's cmd() method uses a two-phase check approach: Phase 1: Validate provisioningState == Succeeded. If not, poll the resource via 'az resource show --ids' with exponential backoff until it reaches a terminal state (Succeeded, Failed, or Canceled). Phase 2: Validate all remaining checks against the original command result, ensuring the operation itself produced correct output. Features: - Opt-in via environment variable (zero risk to existing tests) - Only activates in live mode (playback tests unaffected) - Exponential backoff with jitter (configurable via env vars) - ETag change detection for external modification warnings - Terminal state short-circuit (Failed/Canceled stop polling) - TimeoutError after configurable max retries --- .../azure/cli/testsdk/base.py | 130 ++++++ .../tests/test_provisioning_retry.py | 406 ++++++++++++++++++ 2 files changed, 536 insertions(+) create mode 100644 src/azure-cli-testsdk/azure/cli/testsdk/scenario_tests/tests/test_provisioning_retry.py diff --git a/src/azure-cli-testsdk/azure/cli/testsdk/base.py b/src/azure-cli-testsdk/azure/cli/testsdk/base.py index 51d64ad9252..e1a9b3ad664 100644 --- a/src/azure-cli-testsdk/azure/cli/testsdk/base.py +++ b/src/azure-cli-testsdk/azure/cli/testsdk/base.py @@ -10,6 +10,8 @@ import inspect import unittest import tempfile +import time +import random from .scenario_tests import (IntegrationTestBase, ReplayableTest, SubscriptionRecordingProcessor, LargeRequestBodyProcessor, @@ -78,6 +80,129 @@ def is_empty(self): # pylint: disable=no-self-use from azure.cli.testsdk.checkers import NoneCheck return NoneCheck() + @staticmethod + def _is_provisioning_state_check(check): + """Return True if *check* is a JMESPathCheck asserting provisioningState == 'Succeeded'.""" + from azure.cli.testsdk.checkers import JMESPathCheck + if not isinstance(check, JMESPathCheck): + return False + query = getattr(check, '_query', '') + if not isinstance(query, str): + return False + query = query.lower() + is_provisioning_state = query == 'provisioningstate' or query.endswith('.provisioningstate') + return is_provisioning_state and getattr(check, '_expected_result', '') == 'Succeeded' + + def _should_retry_for_provisioning_state(self, checks): + """Check if any JMESPathCheck asserts provisioningState == 'Succeeded'.""" + env_val = os.environ.get('AZURE_CLI_TEST_RETRY_PROVISIONING_CHECK', 'false').lower() + if not checks or env_val != 'true': + return False + + checks_list = checks if isinstance(checks, list) else [checks] + + for check in checks_list: + if self._is_provisioning_state_check(check): + return True + return False + + def _cmd_with_retry(self, command, checks, cli_ctx): + """Execute command with two-phase check validation. + + Phase 1: Validate provisioningState == Succeeded, retrying with poll if needed. + Phase 2: Validate all remaining checks on the original result. + + Uses etag to detect external modifications (e.g. Azure Policy). + """ + import jmespath + from azure.cli.testsdk.exceptions import JMESPathCheckAssertionError + + max_retries = int(os.environ.get('AZURE_CLI_TEST_MAX_RETRIES', '10')) + base_delay = int(os.environ.get('AZURE_CLI_TEST_BASE_DELAY', '2')) + max_delay = int(os.environ.get('AZURE_CLI_TEST_MAX_DELAY', '60')) + + # Split checks into Phase 1 (provisioningState) and Phase 2 (everything else) + checks_list = checks if isinstance(checks, list) else [checks] + ps_checks = [] + other_checks = [] + for c in checks_list: + if self._is_provisioning_state_check(c): + ps_checks.append(c) + else: + other_checks.append(c) + + # Execute the original command once + result = execute(cli_ctx, command, expect_failure=False) + + # Phase 1: Is the resource ready? + try: + result.assert_with_checks(ps_checks) + except JMESPathCheckAssertionError: + + # Extract resource id and etag for polling + try: + json_value = result.get_output_in_json() + resource_id = jmespath.search('id', json_value) + original_etag = jmespath.search('etag', json_value) or \ + jmespath.search('properties.etag', json_value) + except (KeyError, TypeError, ValueError, AttributeError): + resource_id = None + original_etag = None + + if not resource_id: + raise + + # Poll with generic ARM GET until provisioningState is terminal + poll_command = 'resource show --ids {}'.format(resource_id) + actual_state = None + current_etag = None + last_seen_etag = original_etag + + logger.warning( + "provisioningState was not 'Succeeded' for resource '%s'. " + "Polling with '%s' (max %d retries)...", + resource_id, poll_command, max_retries) + + for attempt in range(max_retries): + delay = min(base_delay * (2 ** attempt) + random.uniform(0, 1), max_delay) + time.sleep(delay) + + poll_result = execute(cli_ctx, poll_command, expect_failure=False) + + try: + poll_json = poll_result.get_output_in_json() + actual_state = jmespath.search('properties.provisioningState', poll_json) + current_etag = jmespath.search('etag', poll_json) + except (KeyError, TypeError, ValueError, AttributeError): + actual_state = None + current_etag = None + + if last_seen_etag and current_etag and current_etag != last_seen_etag: + logger.warning( + "ETag changed ('%s' -> '%s'): resource modified externally " + "(likely Azure Policy). Waiting for it to complete...", + last_seen_etag[:16], current_etag[:16]) + last_seen_etag = current_etag + + if actual_state == 'Succeeded': + break + + if actual_state in ('Failed', 'Canceled'): + raise AssertionError( + "Resource '{}' reached terminal state '{}' after external modification.".format( + resource_id, actual_state)) + else: + raise TimeoutError( + "Resource '{}' did not reach 'Succeeded' after {} retries. " + "Last state: '{}'. Original ETag: '{}', Current ETag: '{}'.".format( + resource_id, max_retries, actual_state, original_etag, current_etag)) + + # Phase 2: Validate the operation result + if other_checks: + result.assert_with_checks(other_checks) + + return result + class ScenarioTest(ReplayableTest, CheckerMixin, unittest.TestCase): def __init__(self, method_name, config_file=None, recording_name=None, @@ -174,6 +299,9 @@ def create_guid(self): def cmd(self, command, checks=None, expect_failure=False): command = self._apply_kwargs(command) + # Only retry in live mode — playback recordings have a fixed HTTP sequence + if self.is_live and not expect_failure and self._should_retry_for_provisioning_state(checks): + return self._cmd_with_retry(command, checks, self.cli_ctx) return execute(self.cli_ctx, command, expect_failure=expect_failure).assert_with_checks(checks) def get_subscription_id(self): @@ -230,6 +358,8 @@ def setUp(self): def cmd(self, command, checks=None, expect_failure=False): command = self._apply_kwargs(command) + if not expect_failure and self._should_retry_for_provisioning_state(checks): + return self._cmd_with_retry(command, checks, self.cli_ctx) return execute(self.cli_ctx, command, expect_failure=expect_failure).assert_with_checks(checks) def get_subscription_id(self): diff --git a/src/azure-cli-testsdk/azure/cli/testsdk/scenario_tests/tests/test_provisioning_retry.py b/src/azure-cli-testsdk/azure/cli/testsdk/scenario_tests/tests/test_provisioning_retry.py new file mode 100644 index 00000000000..153b980baed --- /dev/null +++ b/src/azure-cli-testsdk/azure/cli/testsdk/scenario_tests/tests/test_provisioning_retry.py @@ -0,0 +1,406 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# -------------------------------------------------------------------------------------------- + +import os +import json +import unittest +from unittest import mock + +from azure.cli.testsdk.checkers import JMESPathCheck +from azure.cli.testsdk.exceptions import JMESPathCheckAssertionError + + +class MockExecutionResult: + """Minimal mock of ExecutionResult for testing retry logic.""" + + def __init__(self, json_data, exit_code=0): + self.output = json.dumps(json_data) + self.exit_code = exit_code + self.json_value = None + self.skip_assert = False + self.applog = '' + + def get_output_in_json(self): + if not self.json_value: + self.json_value = json.loads(self.output) + return self.json_value + + def assert_with_checks(self, *args): + checks = [] + for each in args: + if isinstance(each, list): + checks.extend(each) + elif callable(each): + checks.append(each) + if not self.skip_assert: + for c in checks: + c(self) + return self + + +class MockCheckerMixin: + """Instantiable wrapper around CheckerMixin for testing.""" + + def __init__(self): + from azure.cli.testsdk.base import CheckerMixin + # Borrow methods from CheckerMixin + self._should_retry = CheckerMixin._should_retry_for_provisioning_state.__get__(self) + self._retry = CheckerMixin._cmd_with_retry.__get__(self) + self._is_provisioning_state_check = CheckerMixin._is_provisioning_state_check + self.kwargs = {} + + +# --------------------------------------------------------------------------- +# Tests for _should_retry_for_provisioning_state +# --------------------------------------------------------------------------- + +class TestShouldRetryForProvisioningState(unittest.TestCase): + + def _make_mixin(self): + return MockCheckerMixin() + + def test_returns_false_when_env_var_not_set(self): + """Default is 'false' — retry should not trigger.""" + mixin = self._make_mixin() + checks = [JMESPathCheck('provisioningState', 'Succeeded')] + with mock.patch.dict(os.environ, {}, clear=True): + self.assertFalse(mixin._should_retry(checks)) + + def test_returns_true_when_enabled_and_check_matches(self): + mixin = self._make_mixin() + checks = [JMESPathCheck('provisioningState', 'Succeeded')] + with mock.patch.dict(os.environ, {'AZURE_CLI_TEST_RETRY_PROVISIONING_CHECK': 'true'}): + self.assertTrue(mixin._should_retry(checks)) + + def test_returns_false_when_checks_is_none(self): + mixin = self._make_mixin() + with mock.patch.dict(os.environ, {'AZURE_CLI_TEST_RETRY_PROVISIONING_CHECK': 'true'}): + self.assertFalse(mixin._should_retry(None)) + + def test_returns_false_when_checks_is_empty(self): + mixin = self._make_mixin() + with mock.patch.dict(os.environ, {'AZURE_CLI_TEST_RETRY_PROVISIONING_CHECK': 'true'}): + self.assertFalse(mixin._should_retry([])) + + def test_returns_false_when_no_provisioning_state_check(self): + mixin = self._make_mixin() + checks = [JMESPathCheck('name', 'myCluster')] + with mock.patch.dict(os.environ, {'AZURE_CLI_TEST_RETRY_PROVISIONING_CHECK': 'true'}): + self.assertFalse(mixin._should_retry(checks)) + + def test_returns_false_when_expected_value_is_not_succeeded(self): + mixin = self._make_mixin() + checks = [JMESPathCheck('provisioningState', 'Failed')] + with mock.patch.dict(os.environ, {'AZURE_CLI_TEST_RETRY_PROVISIONING_CHECK': 'true'}): + self.assertFalse(mixin._should_retry(checks)) + + def test_case_insensitive_query(self): + mixin = self._make_mixin() + checks = [JMESPathCheck('ProvisioningState', 'Succeeded')] + with mock.patch.dict(os.environ, {'AZURE_CLI_TEST_RETRY_PROVISIONING_CHECK': 'true'}): + self.assertTrue(mixin._should_retry(checks)) + + def test_single_check_not_in_list(self): + """Checks can be passed as a single callable, not wrapped in a list.""" + mixin = self._make_mixin() + check = JMESPathCheck('provisioningState', 'Succeeded') + with mock.patch.dict(os.environ, {'AZURE_CLI_TEST_RETRY_PROVISIONING_CHECK': 'true'}): + self.assertTrue(mixin._should_retry(check)) + + def test_mixed_checks_with_one_matching(self): + mixin = self._make_mixin() + checks = [ + JMESPathCheck('name', 'myCluster'), + JMESPathCheck('provisioningState', 'Succeeded'), + JMESPathCheck('location', 'eastus'), + ] + with mock.patch.dict(os.environ, {'AZURE_CLI_TEST_RETRY_PROVISIONING_CHECK': 'true'}): + self.assertTrue(mixin._should_retry(checks)) + + + +# --------------------------------------------------------------------------- +# Tests for _cmd_with_retry +# --------------------------------------------------------------------------- + +class TestCmdWithRetry(unittest.TestCase): + + def _make_mixin(self): + return MockCheckerMixin() + + def _env(self, **overrides): + """Return env dict with retry enabled and fast delays for testing.""" + env = { + 'AZURE_CLI_TEST_RETRY_PROVISIONING_CHECK': 'true', + 'AZURE_CLI_TEST_MAX_RETRIES': '3', + 'AZURE_CLI_TEST_BASE_DELAY': '0', + 'AZURE_CLI_TEST_MAX_DELAY': '0', + } + env.update(overrides) + return env + + @mock.patch('azure.cli.testsdk.base.execute') + def test_happy_path_no_retry(self, mock_execute): + """If the original command passes all checks, return immediately.""" + mixin = self._make_mixin() + result = MockExecutionResult({ + 'id': '/subscriptions/sub/resourceGroups/rg/providers/Microsoft.ContainerService/managedClusters/mc', + 'provisioningState': 'Succeeded', + 'etag': 'aaa', + }) + mock_execute.return_value = result + + checks = [JMESPathCheck('provisioningState', 'Succeeded')] + + with mock.patch.dict(os.environ, self._env()): + ret = mixin._retry('az aks show -g rg -n mc', checks, mock.MagicMock()) + + self.assertEqual(ret, result) + mock_execute.assert_called_once() # No polling calls + + @mock.patch('azure.cli.testsdk.base.execute') + def test_retry_then_succeed(self, mock_execute): + """Original fails, poll returns Updating then Succeeded.""" + mixin = self._make_mixin() + + original = MockExecutionResult({ + 'id': '/subscriptions/sub/resourceGroups/rg/providers/Microsoft.ContainerService/managedClusters/mc', + 'provisioningState': 'Updating', + 'etag': 'aaa', + }) + poll_updating = MockExecutionResult({ + 'properties': {'provisioningState': 'Updating'}, + 'etag': 'bbb', + }) + poll_succeeded = MockExecutionResult({ + 'properties': {'provisioningState': 'Succeeded'}, + 'etag': 'bbb', + }) + + mock_execute.side_effect = [original, poll_updating, poll_succeeded] + checks = [JMESPathCheck('provisioningState', 'Succeeded')] + + with mock.patch.dict(os.environ, self._env()): + ret = mixin._retry('az aks show -g rg -n mc', checks, mock.MagicMock()) + + # Returns original result (not the poll result) + self.assertEqual(ret, original) + self.assertEqual(mock_execute.call_count, 3) # 1 original + 2 polls + + @mock.patch('azure.cli.testsdk.base.execute') + def test_etag_change_logged(self, mock_execute): + """When etag changes, a warning is logged.""" + mixin = self._make_mixin() + + original = MockExecutionResult({ + 'id': '/subscriptions/sub/resourceGroups/rg/providers/Microsoft.ContainerService/managedClusters/mc', + 'provisioningState': 'Updating', + 'etag': 'original-etag-value', + }) + poll_succeeded = MockExecutionResult({ + 'properties': {'provisioningState': 'Succeeded'}, + 'etag': 'different-etag-value', + }) + + mock_execute.side_effect = [original, poll_succeeded] + checks = [JMESPathCheck('provisioningState', 'Succeeded')] + + with mock.patch.dict(os.environ, self._env()): + with mock.patch('azure.cli.testsdk.base.logger') as mock_logger: + mixin._retry('az aks show -g rg -n mc', checks, mock.MagicMock()) + + # Verify warning was logged about etag change + etag_warnings = [c for c in mock_logger.warning.call_args_list + if 'ETag changed' in c[0][0]] + self.assertEqual(len(etag_warnings), 1) + self.assertIn('ETag changed', etag_warnings[0][0][0]) + + @mock.patch('azure.cli.testsdk.base.execute') + def test_terminal_failure_raises_immediately(self, mock_execute): + """If poll returns Failed, raise immediately without further retries.""" + mixin = self._make_mixin() + + original = MockExecutionResult({ + 'id': '/subscriptions/sub/resourceGroups/rg/providers/Microsoft.ContainerService/managedClusters/mc', + 'provisioningState': 'Updating', + 'etag': 'aaa', + }) + poll_failed = MockExecutionResult({ + 'properties': {'provisioningState': 'Failed'}, + 'etag': 'bbb', + }) + + mock_execute.side_effect = [original, poll_failed] + checks = [JMESPathCheck('provisioningState', 'Succeeded')] + + with mock.patch.dict(os.environ, self._env()): + with self.assertRaises(AssertionError) as ctx: + mixin._retry('az aks show -g rg -n mc', checks, mock.MagicMock()) + + self.assertIn('Failed', str(ctx.exception)) + self.assertEqual(mock_execute.call_count, 2) # 1 original + 1 poll, no more + + @mock.patch('azure.cli.testsdk.base.execute') + def test_no_resource_id_raises_original_error(self, mock_execute): + """If response has no 'id' field, raise the original assertion error.""" + mixin = self._make_mixin() + + original = MockExecutionResult({ + 'provisioningState': 'Updating', + 'name': 'something', + }) + mock_execute.return_value = original + checks = [JMESPathCheck('provisioningState', 'Succeeded')] + + with mock.patch.dict(os.environ, self._env()): + with self.assertRaises(JMESPathCheckAssertionError): + mixin._retry('az aks show -g rg -n mc', checks, mock.MagicMock()) + + mock_execute.assert_called_once() # No polling — can't poll without resource id + + @mock.patch('azure.cli.testsdk.base.execute') + def test_timeout_raises_after_max_retries(self, mock_execute): + """If all polls return Updating, raise TimeoutError after max retries.""" + mixin = self._make_mixin() + + original = MockExecutionResult({ + 'id': '/subscriptions/sub/resourceGroups/rg/providers/Microsoft.ContainerService/managedClusters/mc', + 'provisioningState': 'Updating', + 'etag': 'aaa', + }) + poll_updating = MockExecutionResult({ + 'properties': {'provisioningState': 'Updating'}, + 'etag': 'bbb', + }) + + # 1 original + 3 polls (max_retries=3) + mock_execute.side_effect = [original, poll_updating, poll_updating, poll_updating] + checks = [JMESPathCheck('provisioningState', 'Succeeded')] + + with mock.patch.dict(os.environ, self._env()): + with self.assertRaises(TimeoutError) as ctx: + mixin._retry('az aks show -g rg -n mc', checks, mock.MagicMock()) + + self.assertIn('did not reach', str(ctx.exception)) + self.assertEqual(mock_execute.call_count, 4) # 1 original + 3 polls + + @mock.patch('azure.cli.testsdk.base.execute') + def test_poll_command_uses_resource_id(self, mock_execute): + """Verify the poll uses 'resource show --ids ', not the original command.""" + mixin = self._make_mixin() + + resource_id = '/subscriptions/sub/resourceGroups/rg/providers/Microsoft.ContainerService/managedClusters/mc' + original = MockExecutionResult({ + 'id': resource_id, + 'provisioningState': 'Updating', + 'etag': 'aaa', + }) + poll_succeeded = MockExecutionResult({ + 'properties': {'provisioningState': 'Succeeded'}, + 'etag': 'aaa', + }) + + mock_execute.side_effect = [original, poll_succeeded] + checks = [JMESPathCheck('provisioningState', 'Succeeded')] + + with mock.patch.dict(os.environ, self._env()): + mixin._retry('az aks update -g rg -n mc --enable-something', checks, mock.MagicMock()) + + # Second call should be the poll command, not the original update + poll_call = mock_execute.call_args_list[1] + self.assertIn('resource show --ids', str(poll_call)) + + # ----------------------------------------------------------------------- + # Two-phase check validation tests + # ----------------------------------------------------------------------- + + @mock.patch('azure.cli.testsdk.base.execute') + def test_two_phase_race_with_other_checks_passing(self, mock_execute): + """Scenario A: provisioningState=Updating (race), other checks pass after poll.""" + mixin = self._make_mixin() + + original = MockExecutionResult({ + 'id': '/subscriptions/sub/resourceGroups/rg/providers/Microsoft.ContainerService/managedClusters/mc', + 'provisioningState': 'Updating', + 'etag': 'aaa', + 'networkProfile': {'outboundType': 'managedNATGateway'}, + 'tags': {'key1': 'value1'}, + }) + poll_succeeded = MockExecutionResult({ + 'properties': {'provisioningState': 'Succeeded'}, + 'etag': 'bbb', + }) + + mock_execute.side_effect = [original, poll_succeeded] + checks = [ + JMESPathCheck('provisioningState', 'Succeeded'), + JMESPathCheck('networkProfile.outboundType', 'managedNATGateway'), + JMESPathCheck('tags.key1', 'value1'), + ] + + with mock.patch.dict(os.environ, self._env()): + ret = mixin._retry('az aks create -g rg -n mc', checks, mock.MagicMock()) + + self.assertEqual(ret, original) + self.assertEqual(mock_execute.call_count, 2) # 1 original + 1 poll + + @mock.patch('azure.cli.testsdk.base.execute') + def test_two_phase_no_race_other_check_fails(self, mock_execute): + """Scenario B: provisioningState=Succeeded, but another check fails — error propagates.""" + mixin = self._make_mixin() + + original = MockExecutionResult({ + 'id': '/subscriptions/sub/resourceGroups/rg/providers/Microsoft.ContainerService/managedClusters/mc', + 'provisioningState': 'Succeeded', + 'etag': 'aaa', + 'networkProfile': {'outboundType': 'loadBalancer'}, # Wrong value! + }) + mock_execute.return_value = original + checks = [ + JMESPathCheck('provisioningState', 'Succeeded'), + JMESPathCheck('networkProfile.outboundType', 'managedNATGateway'), # Will fail + ] + + with mock.patch.dict(os.environ, self._env()): + with self.assertRaises(JMESPathCheckAssertionError) as ctx: + mixin._retry('az aks create -g rg -n mc', checks, mock.MagicMock()) + + # Error should be about outboundType, not provisioningState + self.assertIn('outboundType', str(ctx.exception)) + mock_execute.assert_called_once() # No polling — Phase 1 passed + + @mock.patch('azure.cli.testsdk.base.execute') + def test_two_phase_race_and_other_check_fails(self, mock_execute): + """Scenario C: provisioningState=Updating (race) AND another check is wrong — fails correctly.""" + mixin = self._make_mixin() + + original = MockExecutionResult({ + 'id': '/subscriptions/sub/resourceGroups/rg/providers/Microsoft.ContainerService/managedClusters/mc', + 'provisioningState': 'Updating', + 'etag': 'aaa', + 'networkProfile': {'outboundType': 'loadBalancer'}, # Wrong value! + }) + poll_succeeded = MockExecutionResult({ + 'properties': {'provisioningState': 'Succeeded'}, + 'etag': 'bbb', + }) + + mock_execute.side_effect = [original, poll_succeeded] + checks = [ + JMESPathCheck('provisioningState', 'Succeeded'), + JMESPathCheck('networkProfile.outboundType', 'managedNATGateway'), # Will fail in Phase 2 + ] + + with mock.patch.dict(os.environ, self._env()): + with self.assertRaises(JMESPathCheckAssertionError) as ctx: + mixin._retry('az aks create -g rg -n mc', checks, mock.MagicMock()) + + # Error should be about outboundType (Phase 2 failure after Phase 1 retry succeeded) + self.assertIn('outboundType', str(ctx.exception)) + self.assertEqual(mock_execute.call_count, 2) # 1 original + 1 poll + + +if __name__ == '__main__': + unittest.main()