From 150de9b657091d3c775458215194bd505ee56a95 Mon Sep 17 00:00:00 2001 From: maddieford <93676569+maddieford@users.noreply.github.com> Date: Thu, 11 Jan 2024 09:59:59 -0800 Subject: [PATCH 01/11] Daemon should remove stale published_hostname file and log useful warning (#3016) * Daemon should remove published_hostname file and log useful warning * Clean up fast track file if vm id has changed * Clean up initial_goal_state file if vm id has changed * Clean up rsm_update file if vm id has changed --- azurelinuxagent/pa/deprovision/default.py | 6 +++++- azurelinuxagent/pa/provision/default.py | 8 +++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/azurelinuxagent/pa/deprovision/default.py b/azurelinuxagent/pa/deprovision/default.py index edf736811..35b4ae82e 100644 --- a/azurelinuxagent/pa/deprovision/default.py +++ b/azurelinuxagent/pa/deprovision/default.py @@ -158,7 +158,11 @@ def del_lib_dir_files(self, warnings, actions): # pylint: disable=W0613 'partition', 'Protocol', 'SharedConfig.xml', - 'WireServerEndpoint' + 'WireServerEndpoint', + 'published_hostname', + 'fast_track.json', + 'initial_goal_state', + 'rsm_update.json' ] known_files_glob = [ 'Extensions.*.xml', diff --git a/azurelinuxagent/pa/provision/default.py b/azurelinuxagent/pa/provision/default.py index 91fe04eda..a872d70fd 100644 --- a/azurelinuxagent/pa/provision/default.py +++ b/azurelinuxagent/pa/provision/default.py @@ -172,9 +172,11 @@ def check_provisioned_file(self): s = fileutil.read_file(ProvisionHandler.provisioned_file_path()).strip() if not self.osutil.is_current_instance_id(s): if len(s) > 0: - logger.warn("VM is provisioned, " - "but the VM unique identifier has changed -- " - "clearing cached state") + msg = "VM is provisioned, but the VM unique identifier has changed. This indicates the VM may be " \ + "created from an image that was not properly deprovisioned or generalized, which can result in " \ + "unexpected behavior from the guest agent -- clearing cached state" + logger.warn(msg) + self.report_event(msg) from azurelinuxagent.pa.deprovision \ import get_deprovision_handler deprovision_handler = get_deprovision_handler() From 24b9f5aff7eace97643f7c57cf4e5a313b6bd45f Mon Sep 17 00:00:00 2001 From: Norberto Arrieta Date: Fri, 12 Jan 2024 07:35:00 -0800 Subject: [PATCH 02/11] Do not report TestFailedException in test results (#3019) Co-authored-by: narrieta --- tests_e2e/orchestrator/lib/agent_junit.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests_e2e/orchestrator/lib/agent_junit.py b/tests_e2e/orchestrator/lib/agent_junit.py index 78b7e3584..47a5e7d69 100644 --- a/tests_e2e/orchestrator/lib/agent_junit.py +++ b/tests_e2e/orchestrator/lib/agent_junit.py @@ -55,6 +55,9 @@ def _received_message(self, message: MessageBase) -> None: if "Unexpected error in AgentTestSuite" in message.message: # Ignore these errors, they are already reported as AgentTestResultMessages return + if "TestFailedException" in message.message: + # Ignore these errors, they are already reported as test failures + return # Change the suite name to "_Runbook_" for LISA messages in order to separate them # from actual test results. 
message.suite_full_name = "_Runbook_" From ddfb24e1d99a168c6b64af6409134b33ce598d50 Mon Sep 17 00:00:00 2001 From: Nageswara Nandigam <84482346+nagworld9@users.noreply.github.com> Date: Fri, 12 Jan 2024 10:47:09 -0800 Subject: [PATCH 03/11] skip agent update run on arm64 distros (#3018) --- tests_e2e/test_suites/agent_update.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests_e2e/test_suites/agent_update.yml b/tests_e2e/test_suites/agent_update.yml index df25a9221..b78f4109f 100644 --- a/tests_e2e/test_suites/agent_update.yml +++ b/tests_e2e/test_suites/agent_update.yml @@ -3,11 +3,11 @@ # Self-update: If vm not enrolled into RSM, it will validate agent uses self-update to update to latest version published name: "AgentUpdate" tests: -# - "agent_update/rsm_update.py" will enable this test once we have a new test version published +# - "agent_update/rsm_update.py" TODO: will enable this test once we have a new test version published - "agent_update/self_update.py" images: - "random(endorsed, 10)" - - "random(endorsed-arm64, 2)" +# - "random(endorsed-arm64, 2)" TODO: HGPA not deployed on some arm64 hosts(so agent stuck on Vmesttings calls as per contract) and will enable once HGPA deployed there locations: "AzureCloud:eastus2euap" owns_vm: true skip_on_clouds: From c24a9b6d5b53c8afbd6f8385124c3d48b66f809f Mon Sep 17 00:00:00 2001 From: Norberto Arrieta Date: Fri, 12 Jan 2024 11:21:12 -0800 Subject: [PATCH 04/11] Clean test VMs older than 12 hours (#3021) Co-authored-by: narrieta --- tests_e2e/pipeline/pipeline-cleanup.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests_e2e/pipeline/pipeline-cleanup.yml b/tests_e2e/pipeline/pipeline-cleanup.yml index 7f9f57a6c..69e929be5 100644 --- a/tests_e2e/pipeline/pipeline-cleanup.yml +++ b/tests_e2e/pipeline/pipeline-cleanup.yml @@ -13,7 +13,7 @@ parameters: - name: older_than displayName: Delete resources older than (use the syntax of the "date -d" command) type: string - default: 1 day ago + default: 12 hours ago - name: service_connections type: object From c7757023668aa2ebce07c3ae42f353c490320b2a Mon Sep 17 00:00:00 2001 From: Nageswara Nandigam <84482346+nagworld9@users.noreply.github.com> Date: Tue, 16 Jan 2024 15:59:58 -0800 Subject: [PATCH 05/11] honor rsm update with no time when agent receives new GS (#3015) * honor rsm update immediately * pylint * improve msg * address comments * address comments * address comments * added verbose logging --- azurelinuxagent/common/exception.py | 9 ++++ azurelinuxagent/ga/agent_update_handler.py | 52 ++++++++++++------- azurelinuxagent/ga/ga_version_updater.py | 25 ++++----- azurelinuxagent/ga/rsm_version_updater.py | 15 +++--- .../ga/self_update_version_updater.py | 15 +++--- tests/ga/test_agent_update_handler.py | 10 ++++ tests/ga/test_update.py | 6 ++- 7 files changed, 81 insertions(+), 51 deletions(-) diff --git a/azurelinuxagent/common/exception.py b/azurelinuxagent/common/exception.py index 603ed1aa2..42170db85 100644 --- a/azurelinuxagent/common/exception.py +++ b/azurelinuxagent/common/exception.py @@ -84,6 +84,15 @@ def __init__(self, msg=None, inner=None): super(AgentUpdateError, self).__init__(msg, inner) +class AgentFamilyMissingError(AgentError): + """ + When agent family is missing. + """ + + def __init__(self, msg=None, inner=None): + super(AgentFamilyMissingError, self).__init__(msg, inner) + + class CGroupsException(AgentError): """ Exception to classify any cgroups related issue. 
diff --git a/azurelinuxagent/ga/agent_update_handler.py b/azurelinuxagent/ga/agent_update_handler.py index ed157bdf5..f34235702 100644 --- a/azurelinuxagent/ga/agent_update_handler.py +++ b/azurelinuxagent/ga/agent_update_handler.py @@ -15,18 +15,16 @@ # limitations under the License. # # Requires Python 2.6+ and Openssl 1.0+ -import datetime import os from azurelinuxagent.common import conf, logger from azurelinuxagent.common.event import add_event, WALAEventOperation -from azurelinuxagent.common.exception import AgentUpgradeExitException, AgentUpdateError +from azurelinuxagent.common.exception import AgentUpgradeExitException, AgentUpdateError, AgentFamilyMissingError from azurelinuxagent.common.future import ustr from azurelinuxagent.common.protocol.restapi import VMAgentUpdateStatuses, VMAgentUpdateStatus, VERSION_0 from azurelinuxagent.common.utils import textutil from azurelinuxagent.common.utils.flexible_version import FlexibleVersion from azurelinuxagent.common.version import get_daemon_version -from azurelinuxagent.ga.ga_version_updater import RSMUpdates from azurelinuxagent.ga.rsm_version_updater import RSMVersionUpdater from azurelinuxagent.ga.self_update_version_updater import SelfUpdateVersionUpdater @@ -67,7 +65,7 @@ def __init__(self, protocol): # restore the state of rsm update. Default to self-update if last update is not with RSM. if not self._get_is_last_update_with_rsm(): - self._updater = SelfUpdateVersionUpdater(self._gs_id, datetime.datetime.min) + self._updater = SelfUpdateVersionUpdater(self._gs_id) else: self._updater = RSMVersionUpdater(self._gs_id, self._daemon_version) @@ -117,7 +115,7 @@ def _get_agent_family_manifest(self, goal_state): """ Get the agent_family from last GS for the given family Returns: first entry of Manifest - Exception if no manifests found in the last GS + Exception if no manifests found in the last GS and log it only on new goal state """ family = self._ga_family_type agent_families = goal_state.extensions_goal_state.agent_families @@ -130,11 +128,13 @@ def _get_agent_family_manifest(self, goal_state): agent_family_manifests.append(m) if not family_found: - raise AgentUpdateError(u"Agent family: {0} not found in the goal state: {1}, skipping agent update".format(family, self._gs_id)) + raise AgentFamilyMissingError(u"Agent family: {0} not found in the goal state: {1}, skipping agent update \n" + u"[Note: This error is permanent for this goal state and Will not log same error until we receive new goal state]".format(family, self._gs_id)) if len(agent_family_manifests) == 0: - raise AgentUpdateError( - u"No manifest links found for agent family: {0} for goal state: {1}, skipping agent update".format( + raise AgentFamilyMissingError( + u"No manifest links found for agent family: {0} for goal state: {1}, skipping agent update \n" + u"[Note: This error is permanent for this goal state and will not log same error until we receive new goal state]".format( family, self._gs_id)) return agent_family_manifests[0] @@ -145,30 +145,38 @@ def run(self, goal_state, ext_gs_updated): if not conf.get_autoupdate_enabled() or not conf.get_download_new_agents(): return - # verify if agent update is allowed this time (RSM checks new goal state; self-update checks manifest download interval) - if not self._updater.is_update_allowed_this_time(ext_gs_updated): - return + # Update the state only on new goal state + if ext_gs_updated: + self._gs_id = goal_state.extensions_goal_state.id + self._updater.sync_new_gs_id(self._gs_id) - self._gs_id = 
goal_state.extensions_goal_state.id agent_family = self._get_agent_family_manifest(goal_state) - # updater will return RSM enabled or disabled if we need to switch to self-update or rsm update - updater_mode = self._updater.check_and_switch_updater_if_changed(agent_family, self._gs_id, ext_gs_updated) + # Updater will return True or False if we need to switch the updater + # If self-updater receives RSM update enabled, it will switch to RSM updater + # If RSM updater receives RSM update disabled, it will switch to self-update + # No change in updater if GS not updated + is_rsm_update_enabled = self._updater.is_rsm_update_enabled(agent_family, ext_gs_updated) - if updater_mode == RSMUpdates.Disabled: + if not is_rsm_update_enabled and isinstance(self._updater, RSMVersionUpdater): msg = "VM not enabled for RSM updates, switching to self-update mode" logger.info(msg) add_event(op=WALAEventOperation.AgentUpgrade, message=msg, log_event=False) - self._updater = SelfUpdateVersionUpdater(self._gs_id, datetime.datetime.now()) + self._updater = SelfUpdateVersionUpdater(self._gs_id) self._remove_rsm_update_state() - if updater_mode == RSMUpdates.Enabled: + if is_rsm_update_enabled and isinstance(self._updater, SelfUpdateVersionUpdater): msg = "VM enabled for RSM updates, switching to RSM update mode" logger.info(msg) add_event(op=WALAEventOperation.AgentUpgrade, message=msg, log_event=False) self._updater = RSMVersionUpdater(self._gs_id, self._daemon_version) self._save_rsm_update_state() + # If updater is changed in previous step, we allow update as it consider as first attempt. If not, it checks below condition + # RSM checks new goal state; self-update checks manifest download interval + if not self._updater.is_update_allowed_this_time(ext_gs_updated): + return + self._updater.retrieve_agent_version(agent_family, goal_state) if not self._updater.is_retrieved_version_allowed_to_update(agent_family): @@ -183,14 +191,20 @@ def run(self, goal_state, ext_gs_updated): self._updater.proceed_with_update() except Exception as err: + log_error = True if isinstance(err, AgentUpgradeExitException): raise err elif isinstance(err, AgentUpdateError): error_msg = ustr(err) + elif isinstance(err, AgentFamilyMissingError): + error_msg = ustr(err) + # Agent family missing error is permanent in the given goal state, so we don't want to log it on every iteration of main loop if there is no new goal state + log_error = ext_gs_updated else: error_msg = "Unable to update Agent: {0}".format(textutil.format_exception(err)) - logger.warn(error_msg) - add_event(op=WALAEventOperation.AgentUpgrade, is_success=False, message=error_msg, log_event=False) + if log_error: + logger.warn(error_msg) + add_event(op=WALAEventOperation.AgentUpgrade, is_success=False, message=error_msg, log_event=False) self._last_attempted_update_error_msg = error_msg def get_vmagent_update_status(self): diff --git a/azurelinuxagent/ga/ga_version_updater.py b/azurelinuxagent/ga/ga_version_updater.py index 0d3f639f2..46ae1f31f 100644 --- a/azurelinuxagent/ga/ga_version_updater.py +++ b/azurelinuxagent/ga/ga_version_updater.py @@ -30,14 +30,6 @@ from azurelinuxagent.ga.guestagent import GuestAgent -class RSMUpdates(object): - """ - Enum for switching between RSM updates and self updates - """ - Enabled = "Enabled" - Disabled = "Disabled" - - class GAVersionUpdater(object): def __init__(self, gs_id): @@ -53,15 +45,13 @@ def is_update_allowed_this_time(self, ext_gs_updated): """ raise NotImplementedError - def check_and_switch_updater_if_changed(self, 
agent_family, gs_id, ext_gs_updated): + def is_rsm_update_enabled(self, agent_family, ext_gs_updated): """ - checks and raise the updater exception if we need to switch to self-update from rsm update or vice versa + return True if we need to switch to RSM-update from self-update and vice versa. @param agent_family: agent family - @param gs_id: incarnation of the goal state @param ext_gs_updated: True if extension goal state updated else False - @return: RSMUpdates.Disabled: return when agent need to stop rsm updates and switch to self-update - RSMUpdates.Enabled: return when agent need to switch to rsm update - None: return when no need to switch + @return: False when agent need to stop rsm updates + True: when agent need to switch to rsm update """ raise NotImplementedError @@ -107,6 +97,13 @@ def version(self): """ return self._version + def sync_new_gs_id(self, gs_id): + """ + Update gs_id + @param gs_id: goal state id + """ + self._gs_id = gs_id + def download_and_get_new_agent(self, protocol, agent_family, goal_state): """ Function downloads the new agent and returns the downloaded version. diff --git a/azurelinuxagent/ga/rsm_version_updater.py b/azurelinuxagent/ga/rsm_version_updater.py index 6df7b6e30..a7a8bd97d 100644 --- a/azurelinuxagent/ga/rsm_version_updater.py +++ b/azurelinuxagent/ga/rsm_version_updater.py @@ -24,7 +24,7 @@ from azurelinuxagent.common.exception import AgentUpgradeExitException, AgentUpdateError from azurelinuxagent.common.utils.flexible_version import FlexibleVersion from azurelinuxagent.common.version import CURRENT_VERSION, AGENT_NAME -from azurelinuxagent.ga.ga_version_updater import GAVersionUpdater, RSMUpdates +from azurelinuxagent.ga.ga_version_updater import GAVersionUpdater from azurelinuxagent.ga.guestagent import GuestAgent @@ -49,24 +49,23 @@ def is_update_allowed_this_time(self, ext_gs_updated): """ return ext_gs_updated - def check_and_switch_updater_if_changed(self, agent_family, gs_id, ext_gs_updated): + def is_rsm_update_enabled(self, agent_family, ext_gs_updated): """ Checks if there is a new goal state and decide if we need to continue with rsm update or switch to self-update. - Firstly it checks agent supports GA versioning or not. If not, we return rsm updates disabled to switch to self-update. - if vm is enabled for RSM updates and continue with rsm update, otherwise we return rsm updates disabled to switch to self-update. + Firstly it checks agent supports GA versioning or not. If not, we return false to switch to self-update. + if vm is enabled for RSM updates and continue with rsm update, otherwise we return false to switch to self-update. if either isVersionFromRSM or isVMEnabledForRSMUpgrades or version is missing in the goal state, we ignore the update as we consider it as invalid goal state. """ if ext_gs_updated: - self._gs_id = gs_id if not conf.get_enable_ga_versioning(): - return RSMUpdates.Disabled + return False if agent_family.is_vm_enabled_for_rsm_upgrades is None: raise AgentUpdateError( "Received invalid goal state:{0}, missing isVMEnabledForRSMUpgrades property. So, skipping agent update".format( self._gs_id)) elif not agent_family.is_vm_enabled_for_rsm_upgrades: - return RSMUpdates.Disabled + return False else: if agent_family.is_version_from_rsm is None: raise AgentUpdateError( @@ -77,7 +76,7 @@ def check_and_switch_updater_if_changed(self, agent_family, gs_id, ext_gs_update "Received invalid goal state:{0}, missing version property. 
So, skipping agent update".format( self._gs_id)) - return None + return True def retrieve_agent_version(self, agent_family, goal_state): """ diff --git a/azurelinuxagent/ga/self_update_version_updater.py b/azurelinuxagent/ga/self_update_version_updater.py index ca27c4399..6605a28eb 100644 --- a/azurelinuxagent/ga/self_update_version_updater.py +++ b/azurelinuxagent/ga/self_update_version_updater.py @@ -23,7 +23,7 @@ from azurelinuxagent.common.exception import AgentUpgradeExitException, AgentUpdateError from azurelinuxagent.common.utils.flexible_version import FlexibleVersion from azurelinuxagent.common.version import CURRENT_VERSION -from azurelinuxagent.ga.ga_version_updater import GAVersionUpdater, RSMUpdates +from azurelinuxagent.ga.ga_version_updater import GAVersionUpdater class SelfUpdateType(object): @@ -35,9 +35,9 @@ class SelfUpdateType(object): class SelfUpdateVersionUpdater(GAVersionUpdater): - def __init__(self, gs_id, last_attempted_manifest_download_time): + def __init__(self, gs_id): super(SelfUpdateVersionUpdater, self).__init__(gs_id) - self._last_attempted_manifest_download_time = last_attempted_manifest_download_time + self._last_attempted_manifest_download_time = datetime.datetime.min self._last_attempted_self_update_time = datetime.datetime.min @staticmethod @@ -119,14 +119,13 @@ def is_update_allowed_this_time(self, ext_gs_updated): return False return True - def check_and_switch_updater_if_changed(self, agent_family, gs_id, ext_gs_updated): + def is_rsm_update_enabled(self, agent_family, ext_gs_updated): """ Checks if there is a new goal state and decide if we need to continue with self-update or switch to rsm update. - if vm is not enabled for RSM updates or agent not supports GA versioning then we continue with self update, otherwise we rsm enabled to switch to rsm update. + if vm is not enabled for RSM updates or agent not supports GA versioning then we continue with self update, otherwise we return true to switch to rsm update. if isVersionFromRSM is missing but isVMEnabledForRSMUpgrades is present in the goal state, we ignore the update as we consider it as invalid goal state. """ if ext_gs_updated: - self._gs_id = gs_id if conf.get_enable_ga_versioning() and agent_family.is_vm_enabled_for_rsm_upgrades is not None and agent_family.is_vm_enabled_for_rsm_upgrades: if agent_family.is_version_from_rsm is None: raise AgentUpdateError( @@ -137,9 +136,9 @@ def check_and_switch_updater_if_changed(self, agent_family, gs_id, ext_gs_update raise AgentUpdateError( "Received invalid goal state:{0}, missing version property. 
So, skipping agent update".format( self._gs_id)) - return RSMUpdates.Enabled + return True - return None + return False def retrieve_agent_version(self, agent_family, goal_state): """ diff --git a/tests/ga/test_agent_update_handler.py b/tests/ga/test_agent_update_handler.py index 0ac373a6b..5ba7f3c70 100644 --- a/tests/ga/test_agent_update_handler.py +++ b/tests/ga/test_agent_update_handler.py @@ -366,6 +366,16 @@ def test_handles_missing_agent_family(self): 'message'] and kwarg[ 'op'] == WALAEventOperation.AgentUpgrade]), "Agent manifest should not be in GS") + # making multiple agent update attempts and assert only one time logged + agent_update_handler.run(agent_update_handler._protocol.get_goal_state(), False) + agent_update_handler.run(agent_update_handler._protocol.get_goal_state(), False) + + self.assertEqual(1, len([kwarg['message'] for _, kwarg in mock_telemetry.call_args_list if + "No manifest links found for agent family" in kwarg[ + 'message'] and kwarg[ + 'op'] == WALAEventOperation.AgentUpgrade]), + "Agent manifest error should be logged once if it's same goal state") + def test_it_should_report_update_status_with_success(self): data_file = DATA_FILE.copy() data_file["ext_conf"] = "wire/ext_conf_rsm_version.xml" diff --git a/tests/ga/test_update.py b/tests/ga/test_update.py index 37fb75796..c25585f14 100644 --- a/tests/ga/test_update.py +++ b/tests/ga/test_update.py @@ -1784,13 +1784,14 @@ def test_it_should_not_download_anything_if_rsm_version_is_current_version(self) self.assertFalse(os.path.exists(self.agent_dir("99999.0.0.0")), "New agent directory should not be found") - def test_it_should_skip_wait_to_update_if_rsm_version_available(self): + def test_it_should_skip_wait_to_update_immediately_if_rsm_version_available(self): no_of_iterations = 100 def reload_conf(url, protocol): mock_wire_data = protocol.mock_wire_data # This function reloads the conf mid-run to mimic an actual customer scenario + # Setting the rsm request to be sent after some iterations if HttpRequestPredicates.is_goal_state_request(url) and mock_wire_data.call_counts["goalstate"] >= 5: reload_conf.call_count += 1 @@ -1808,7 +1809,8 @@ def reload_conf(url, protocol): data_file = wire_protocol_data.DATA_FILE.copy() data_file['ga_manifest'] = "wire/ga_manifest_no_upgrade.xml" - with self.__get_update_handler(iterations=no_of_iterations, test_data=data_file, reload_conf=reload_conf) as (update_handler, mock_telemetry): + # Setting the prod frequency to mimic a real scenario + with self.__get_update_handler(iterations=no_of_iterations, test_data=data_file, reload_conf=reload_conf, autoupdate_frequency=6000) as (update_handler, mock_telemetry): update_handler._protocol.mock_wire_data.set_ga_manifest_version_version(str(CURRENT_VERSION)) update_handler._protocol.mock_wire_data.set_incarnation(20) update_handler.run(debug=True) From 5b4166c98f820349d3f5a5904ab2b476b126b2b8 Mon Sep 17 00:00:00 2001 From: Norberto Arrieta Date: Tue, 16 Jan 2024 17:59:38 -0800 Subject: [PATCH 06/11] Don't check Agent log from the top after each test suite (#3022) * Don't check Agent log from the top after each test suite * fix initialization of override --------- Co-authored-by: narrieta --- .../orchestrator/lib/agent_test_suite.py | 47 +++++++++---------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/tests_e2e/orchestrator/lib/agent_test_suite.py b/tests_e2e/orchestrator/lib/agent_test_suite.py index 7b2becd59..2a7241d78 100644 --- a/tests_e2e/orchestrator/lib/agent_test_suite.py +++ 
b/tests_e2e/orchestrator/lib/agent_test_suite.py @@ -573,10 +573,12 @@ def _execute(self) -> None: test_suite_success = False raise + check_log_start_time = datetime.datetime.min + for suite in self._test_suites: log.info("Executing test suite %s", suite.name) self._lisa_log.info("Executing Test Suite %s", suite.name) - case_success = self._execute_test_suite(suite, test_context) + case_success, check_log_start_time = self._execute_test_suite(suite, test_context, check_log_start_time) test_suite_success = case_success and test_suite_success if not case_success: failed_cases.append(suite.name) @@ -611,13 +613,15 @@ def _execute(self) -> None: if not test_suite_success or unexpected_error: raise TestFailedException(self._environment_name, failed_cases) - def _execute_test_suite(self, suite: TestSuiteInfo, test_context: AgentTestContext) -> bool: + def _execute_test_suite(self, suite: TestSuiteInfo, test_context: AgentTestContext, check_log_start_time: datetime.datetime) -> Tuple[bool, datetime.datetime]: """ - Executes the given test suite and returns True if all the tests in the suite succeeded. + Executes the given test suite and returns a tuple of a bool indicating whether all the tests in the suite succeeded, and the timestamp that should be used + for the next check of the agent log. """ suite_name = suite.name suite_full_name = f"{suite_name}-{self._environment_name}" suite_start_time: datetime.datetime = datetime.datetime.now() + check_log_start_time_override = datetime.datetime.max # tests can override the timestamp for the agent log check with the get_ignore_errors_before_timestamp() method with set_thread_name(suite_full_name): # The thread name is added to the LISA log log_path: Path = self._log_path / f"{suite_full_name}.log" @@ -631,7 +635,6 @@ def _execute_test_suite(self, suite: TestSuiteInfo, test_context: AgentTestConte summary: List[str] = [] ignore_error_rules: List[Dict[str, Any]] = [] - before_timestamp = datetime.datetime.min for test in suite.tests: test_full_name = f"{suite_name}-{test.name}" @@ -705,13 +708,11 @@ def _execute_test_suite(self, suite: TestSuiteInfo, test_context: AgentTestConte ignore_error_rules.extend(test_instance.get_ignore_error_rules()) - # If the test has a timestamp before which errors should be ignored in the agent log, use that timestamp - # if multiple tests have this setting, use the earliest timestamp - if test_instance.get_ignore_errors_before_timestamp() != datetime.datetime.min: - if before_timestamp != datetime.datetime.min: - before_timestamp = min(before_timestamp, test_instance.get_ignore_errors_before_timestamp()) - else: - before_timestamp = test_instance.get_ignore_errors_before_timestamp() + # Check if the test is requesting to override the timestamp for the agent log check. + # Note that if multiple tests in the suite provide an override, we'll use the earliest timestamp. + test_check_log_start_time = test_instance.get_ignore_errors_before_timestamp() + if test_check_log_start_time != datetime.datetime.min: + check_log_start_time_override = min(check_log_start_time_override, test_check_log_start_time) if not test_success and test.blocks_suite: log.warning("%s failed and blocks the suite. 
Stopping suite execution.", test.name) @@ -737,11 +738,12 @@ def _execute_test_suite(self, suite: TestSuiteInfo, test_context: AgentTestConte if not suite_success: self._mark_log_as_failed() - suite_success = suite_success and self._check_agent_log_on_test_nodes(ignore_error_rules, before_timestamp) + next_check_log_start_time = datetime.datetime.utcnow() + suite_success = suite_success and self._check_agent_log_on_test_nodes(ignore_error_rules, check_log_start_time_override if check_log_start_time_override != datetime.datetime.max else check_log_start_time) - return suite_success + return suite_success, next_check_log_start_time - def _check_agent_log_on_test_nodes(self, ignore_error_rules: List[Dict[str, Any]], before_timestamp: datetime) -> bool: + def _check_agent_log_on_test_nodes(self, ignore_error_rules: List[Dict[str, Any]], check_log_start_time: datetime.datetime) -> bool: """ Checks the agent log on the test nodes for errors; returns true on success (no errors in the logs) """ @@ -759,22 +761,15 @@ def _check_agent_log_on_test_nodes(self, ignore_error_rules: List[Dict[str, Any] start_time: datetime.datetime = datetime.datetime.now() try: - self._lisa_log.info("Checking agent log on the test node %s", node_name) - log.info("Checking agent log on the test node %s", node_name) + message = f"Checking agent log on test node {node_name}, starting at {check_log_start_time.strftime('%Y-%m-%dT%H:%M:%S.%fZ')}" + self._lisa_log.info(message) + log.info(message) output = ssh_client.run_command("check-agent-log.py -j") errors = json.loads(output, object_hook=AgentLogRecord.from_dictionary) - # Individual tests may have rules to ignore known errors; filter those out - if len(ignore_error_rules) > 0: - new = [] - for e in errors: - # Ignore errors that occurred before the timestamp - if e.timestamp < before_timestamp: - continue - if not AgentLog.matches_ignore_rule(e, ignore_error_rules): - new.append(e) - errors = new + # Filter out errors that occurred before the starting timestamp or that match an ignore rule + errors = [e for e in errors if e.timestamp >= check_log_start_time and (len(ignore_error_rules) == 0 or not AgentLog.matches_ignore_rule(e, ignore_error_rules))] if len(errors) == 0: # If no errors, we are done; don't create a log or test result. 
From bf3738b0c18a9c86af1389f222758d78471df92e Mon Sep 17 00:00:00 2001 From: Zhidong Peng Date: Fri, 19 Jan 2024 12:58:23 -0800 Subject: [PATCH 07/11] update the proxy agenet log folder for logcollector (#3028) --- azurelinuxagent/ga/logcollector_manifests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azurelinuxagent/ga/logcollector_manifests.py b/azurelinuxagent/ga/logcollector_manifests.py index fc240d30c..3548de4fc 100644 --- a/azurelinuxagent/ga/logcollector_manifests.py +++ b/azurelinuxagent/ga/logcollector_manifests.py @@ -122,6 +122,6 @@ diskinfo, echo,### Gathering Guest ProxyAgent Log Files ### -copy,/var/log/proxyagent/* +copy,/var/log/azure-proxy-agent/* echo, """ From d8beb1af1840c366b526f988f1240ba33c1abb54 Mon Sep 17 00:00:00 2001 From: maddieford <93676569+maddieford@users.noreply.github.com> Date: Mon, 22 Jan 2024 11:08:12 -0800 Subject: [PATCH 08/11] Log instance view before asserting (#3029) --- tests_e2e/tests/lib/virtual_machine_extension_client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests_e2e/tests/lib/virtual_machine_extension_client.py b/tests_e2e/tests/lib/virtual_machine_extension_client.py index 699ed7cb4..d54f881d0 100644 --- a/tests_e2e/tests/lib/virtual_machine_extension_client.py +++ b/tests_e2e/tests/lib/virtual_machine_extension_client.py @@ -135,6 +135,7 @@ def assert_instance_view( additional validations. """ instance_view = self.get_instance_view() + log.info("Instance view:\n%s", instance_view.serialize()) with soft_assertions(): if expected_version is not None: From 3c32d45177a44ca1fb83baffefc6b23780001575 Mon Sep 17 00:00:00 2001 From: Norberto Arrieta Date: Thu, 1 Feb 2024 09:47:31 -0800 Subject: [PATCH 09/11] Add config parameter to wait for cloud-init (Extensions.WaitForCloudInit) (#3031) * Add config parameter to wait for cloud-init (Extensions.WaitForCloudInit) --------- Co-authored-by: narrieta --- README.md | 24 +++++ azurelinuxagent/common/conf.py | 10 ++ azurelinuxagent/common/event.py | 1 + azurelinuxagent/common/utils/shellutil.py | 39 +++++++- azurelinuxagent/ga/update.py | 20 ++++ tests/common/test_conf.py | 2 + tests/common/utils/test_shell_util.py | 9 +- tests/ga/test_update.py | 61 ++++++++++++- tests/lib/mock_update_handler.py | 4 +- tests/test_agent.py | 2 + .../lib/agent_test_suite_combinator.py | 74 +++++++++++---- tests_e2e/orchestrator/runbook.yml | 2 +- .../test_suites/agent_wait_for_cloud_init.yml | 13 +++ .../disable_agent_provisioning.py | 43 ++------- .../add_cloud_init_script.py | 63 +++++++++++++ .../agent_wait_for_cloud_init.py | 91 +++++++++++++++++++ tests_e2e/tests/lib/update_arm_template.py | 83 +++++++++++++++++ 17 files changed, 476 insertions(+), 65 deletions(-) create mode 100644 tests_e2e/test_suites/agent_wait_for_cloud_init.yml create mode 100755 tests_e2e/tests/agent_wait_for_cloud_init/add_cloud_init_script.py create mode 100755 tests_e2e/tests/agent_wait_for_cloud_init/agent_wait_for_cloud_init.py diff --git a/README.md b/README.md index 3d3a824e1..6d0296bfc 100644 --- a/README.md +++ b/README.md @@ -261,6 +261,30 @@ without the agent. In order to do that, the `provisionVMAgent` flag must be set provisioning time, via whichever API is being used. We will provide more details on this on our wiki when it is generally available. +#### __Extensions.WaitForCloudInit__ + +_Type: Boolean_ +_Default: n_ + +Waits for cloud-init to complete (cloud-init status --wait) before executing VM extensions. 
+ +Both cloud-init and VM extensions are common ways to customize a VM during initial deployment. By +default, the agent will start executing extensions while cloud-init may still be in the 'config' +stage and won't wait for the 'final' stage to complete. Cloud-init and extensions may execute operations +that conflict with each other (for example, both of them may try to install packages). Setting this option +to 'y' ensures that VM extensions are executed only after cloud-init has completed all its stages. + +Note that using this option requires creating a custom image with the value of this option set to 'y', in +order to ensure that the wait is performed during the initial deployment of the VM. + +#### __Extensions.WaitForCloudInitTimeout__ + +_Type: Integer_ +_Default: 3600_ + +Timeout in seconds for the Agent to wait on cloud-init. If the timeout elapses, the Agent will continue +executing VM extensions. See Extensions.WaitForCloudInit for more details. + #### __Extensions.GoalStatePeriod__ _Type: Integer_ diff --git a/azurelinuxagent/common/conf.py b/azurelinuxagent/common/conf.py index 57d6c9d28..a13f33357 100644 --- a/azurelinuxagent/common/conf.py +++ b/azurelinuxagent/common/conf.py @@ -117,6 +117,7 @@ def load_conf_from_file(conf_file_path, conf=__conf__): "Logs.Console": True, "Logs.Collect": True, "Extensions.Enabled": True, + "Extensions.WaitForCloudInit": False, "Provisioning.AllowResetSysUser": False, "Provisioning.RegenerateSshHostKeyPair": False, "Provisioning.DeleteRootPassword": False, @@ -170,6 +171,7 @@ def load_conf_from_file(conf_file_path, conf=__conf__): __INTEGER_OPTIONS__ = { "Extensions.GoalStatePeriod": 6, "Extensions.InitialGoalStatePeriod": 6, + "Extensions.WaitForCloudInitTimeout": 3600, "OS.EnableFirewallPeriod": 300, "OS.RemovePersistentNetRulesPeriod": 30, "OS.RootDeviceScsiTimeoutPeriod": 30, @@ -372,6 +374,14 @@ def get_extensions_enabled(conf=__conf__): return conf.get_switch("Extensions.Enabled", True) +def get_wait_for_cloud_init(conf=__conf__): + return conf.get_switch("Extensions.WaitForCloudInit", False) + + +def get_wait_for_cloud_init_timeout(conf=__conf__): + return conf.get_switch("Extensions.WaitForCloudInitTimeout", 3600) + + def get_goal_state_period(conf=__conf__): return conf.get_int("Extensions.GoalStatePeriod", 6) diff --git a/azurelinuxagent/common/event.py b/azurelinuxagent/common/event.py index fe313968f..b01058380 100644 --- a/azurelinuxagent/common/event.py +++ b/azurelinuxagent/common/event.py @@ -75,6 +75,7 @@ class WALAEventOperation: CGroupsCleanUp = "CGroupsCleanUp" CGroupsDisabled = "CGroupsDisabled" CGroupsInfo = "CGroupsInfo" + CloudInit = "CloudInit" CollectEventErrors = "CollectEventErrors" CollectEventUnicodeErrors = "CollectEventUnicodeErrors" ConfigurationChange = "ConfigurationChange" diff --git a/azurelinuxagent/common/utils/shellutil.py b/azurelinuxagent/common/utils/shellutil.py index 50fd4592f..d2bfd787e 100644 --- a/azurelinuxagent/common/utils/shellutil.py +++ b/azurelinuxagent/common/utils/shellutil.py @@ -18,9 +18,17 @@ # import os import subprocess +import sys import tempfile import threading +if sys.version_info[0] == 2: + # TimeoutExpired was introduced on Python 3; define a dummy class for Python 2 + class TimeoutExpired(Exception): + pass +else: + from subprocess import TimeoutExpired + import azurelinuxagent.common.logger as logger from azurelinuxagent.common.future import ustr @@ -206,7 +214,7 @@ def __run_command(command_action, command, log_error, encode_output): # W0622: Redefining built-in 'input' 
-- disabled: the parameter name mimics subprocess.communicate() -def run_command(command, input=None, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE, log_error=False, encode_input=True, encode_output=True, track_process=True): # pylint:disable=W0622 +def run_command(command, input=None, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE, log_error=False, encode_input=True, encode_output=True, track_process=True, timeout=None): # pylint:disable=W0622 """ Executes the given command and returns its stdout. @@ -227,7 +235,9 @@ def run_command(command, input=None, stdin=None, stdout=subprocess.PIPE, stderr= value for these parameters is anything other than the default (subprocess.PIPE)), then the corresponding values returned by this function or the CommandError exception will be empty strings. - Note: This is the preferred method to execute shell commands over `azurelinuxagent.common.utils.shellutil.run` function. + NOTE: The 'timeout' parameter is ignored on Python 2 + + NOTE: This is the preferred method to execute shell commands over `azurelinuxagent.common.utils.shellutil.run` function. """ if input is not None and stdin is not None: raise ValueError("The input and stdin arguments are mutually exclusive") @@ -246,7 +256,30 @@ def command_action(): else: process = subprocess.Popen(command, stdin=popen_stdin, stdout=stdout, stderr=stderr, shell=False) - command_stdout, command_stderr = process.communicate(input=communicate_input) + try: + if sys.version_info[0] == 2: # communicate() doesn't support timeout on Python 2 + command_stdout, command_stderr = process.communicate(input=communicate_input) + else: + command_stdout, command_stderr = process.communicate(input=communicate_input, timeout=timeout) + except TimeoutExpired: + if log_error: + logger.error(u"Command [{0}] timed out", __format_command(command)) + + command_stdout, command_stderr = '', '' + + try: + process.kill() + # try to get any output from the command, but ignore any errors if we can't + try: + command_stdout, command_stderr = process.communicate() + # W0702: No exception type(s) specified (bare-except) + except: # pylint: disable=W0702 + pass + except Exception as exception: + if log_error: + logger.error(u"Can't terminate timed out process: {0}", ustr(exception)) + raise CommandError(command=__format_command(command), return_code=-1, stdout=command_stdout, stderr="command timeout\n{0}".format(command_stderr)) + if track_process: _on_command_completed(process.pid) diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index 88267b75e..1a0e36240 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -149,6 +149,8 @@ def __init__(self): self._last_check_memory_usage_time = time.time() self._check_memory_usage_last_error_report = datetime.min + self._cloud_init_completed = False # Only used when Extensions.WaitForCloudInit is enabled; note that this variable is always reset on service start. + # VM Size is reported via the heartbeat, default it here. self._vm_size = None @@ -458,6 +460,22 @@ def _initialize_goal_state(self, protocol): logger.info("The current Fabric goal state is older than the most recent FastTrack goal state; will skip it.\nFabric: {0}\nFastTrack: {1}", egs.created_on_timestamp, last_fast_track_timestamp) + def _wait_for_cloud_init(self): + if conf.get_wait_for_cloud_init() and not self._cloud_init_completed: + message = "Waiting for cloud-init to complete..." 
+ logger.info(message) + add_event(op=WALAEventOperation.CloudInit, message=message) + try: + output = shellutil.run_command(["cloud-init", "status", "--wait"], timeout=conf.get_wait_for_cloud_init_timeout()) + message = "cloud-init completed\n{0}".format(output) + logger.info(message) + add_event(op=WALAEventOperation.CloudInit, message=message) + except Exception as e: + message = "An error occurred while waiting for cloud-init; will proceed to execute VM extensions. Extensions that have conflicts with cloud-init may fail.\n{0}".format(ustr(e)) + logger.error(message) + add_event(op=WALAEventOperation.CloudInit, message=message, is_success=False, log_event=False) + self._cloud_init_completed = True # Mark as completed even on error since we will proceed to execute extensions + def _get_vm_size(self, protocol): """ Including VMSize is meant to capture the architecture of the VM (i.e. arm64 VMs will @@ -562,6 +580,8 @@ def _process_goal_state(self, exthandlers_handler, remote_access_handler, agent_ # check for agent updates agent_update_handler.run(self._goal_state, self._processing_new_extensions_goal_state()) + self._wait_for_cloud_init() + try: if self._processing_new_extensions_goal_state(): if not self._extensions_summary.converged: diff --git a/tests/common/test_conf.py b/tests/common/test_conf.py index 972b289a7..1ae951bf9 100644 --- a/tests/common/test_conf.py +++ b/tests/common/test_conf.py @@ -27,6 +27,8 @@ class TestConf(AgentTestCase): # -- These values *MUST* match those from data/test_waagent.conf EXPECTED_CONFIGURATION = { "Extensions.Enabled": True, + "Extensions.WaitForCloudInit": False, + "Extensions.WaitForCloudInitTimeout": 3600, "Provisioning.Agent": "auto", "Provisioning.DeleteRootPassword": True, "Provisioning.RegenerateSshHostKeyPair": True, diff --git a/tests/common/utils/test_shell_util.py b/tests/common/utils/test_shell_util.py index 3c6afc60e..5eb5a83a6 100644 --- a/tests/common/utils/test_shell_util.py +++ b/tests/common/utils/test_shell_util.py @@ -18,13 +18,14 @@ import os import signal import subprocess +import sys import tempfile import threading import unittest from azurelinuxagent.common.future import ustr import azurelinuxagent.common.utils.shellutil as shellutil -from tests.lib.tools import AgentTestCase, patch +from tests.lib.tools import AgentTestCase, patch, skip_if_predicate_true from tests.lib.miscellaneous_tools import wait_for, format_processes @@ -225,6 +226,12 @@ def test_run_command_should_raise_an_exception_when_it_cannot_execute_the_comman self.__it_should_raise_an_exception_when_it_cannot_execute_the_command( lambda: shellutil.run_command("nonexistent_command")) + @skip_if_predicate_true(lambda: sys.version_info[0] == 2, "Timeouts are not supported on Python 2") + def test_run_command_should_raise_an_exception_when_the_command_times_out(self): + with self.assertRaises(shellutil.CommandError) as context: + shellutil.run_command(["sleep", "5"], timeout=1) + self.assertIn("command timeout", context.exception.stderr, "The command did not time out") + def test_run_pipe_should_raise_an_exception_when_it_cannot_execute_the_pipe(self): self.__it_should_raise_an_exception_when_it_cannot_execute_the_command( lambda: shellutil.run_pipe([["ls", "-ld", "."], ["nonexistent_command"], ["wc", "-l"]])) diff --git a/tests/ga/test_update.py b/tests/ga/test_update.py index c25585f14..aa39ccb55 100644 --- a/tests/ga/test_update.py +++ b/tests/ga/test_update.py @@ -38,7 +38,7 @@ ExtHandlerPackage, ExtHandlerPackageList, Extension, VMStatus, ExtHandlerStatus, 
ExtensionStatus, \ VMAgentUpdateStatuses from azurelinuxagent.common.protocol.util import ProtocolUtil -from azurelinuxagent.common.utils import fileutil, textutil, timeutil +from azurelinuxagent.common.utils import fileutil, textutil, timeutil, shellutil from azurelinuxagent.common.utils.archive import ARCHIVE_DIRECTORY_NAME, AGENT_STATUS_FILE from azurelinuxagent.common.utils.flexible_version import FlexibleVersion from azurelinuxagent.common.utils.networkutil import FirewallCmdDirectCommands, AddFirewallRules @@ -980,7 +980,6 @@ def match_expected_info(): match_unexpected_errors() # Match on errors first, they can provide more info. match_expected_info() - def test_it_should_recreate_handler_env_on_service_startup(self): iterations = 5 @@ -1361,6 +1360,64 @@ def test_it_should_reset_legacy_blacklisted_agents_on_process_start(self): self.assertFalse(agent.is_blacklisted, "Legacy Agent should not be blacklisted") +class TestUpdateWaitForCloudInit(AgentTestCase): + @staticmethod + @contextlib.contextmanager + def create_mock_run_command(delay=None): + def run_command_mock(cmd, *args, **kwargs): + if cmd == ["cloud-init", "status", "--wait"]: + if delay is not None: + original_run_command(['sleep', str(delay)], *args, **kwargs) + return "cloud-init completed" + return original_run_command(cmd, *args, **kwargs) + original_run_command = shellutil.run_command + + with patch("azurelinuxagent.ga.update.shellutil.run_command", side_effect=run_command_mock) as run_command_patch: + yield run_command_patch + + def test_it_should_not_wait_for_cloud_init_by_default(self): + update_handler = UpdateHandler() + with self.create_mock_run_command() as run_command_patch: + update_handler._wait_for_cloud_init() + self.assertTrue(run_command_patch.call_count == 0, "'cloud-init status --wait' should not be called by default") + + def test_it_should_wait_for_cloud_init_when_requested(self): + update_handler = UpdateHandler() + with patch("azurelinuxagent.ga.update.conf.get_wait_for_cloud_init", return_value=True): + with self.create_mock_run_command() as run_command_patch: + update_handler._wait_for_cloud_init() + self.assertEqual(1, run_command_patch.call_count, "'cloud-init status --wait' should have be called once") + + @skip_if_predicate_true(lambda: sys.version_info[0] == 2, "Timeouts are not supported on Python 2") + def test_it_should_enforce_timeout_waiting_for_cloud_init(self): + update_handler = UpdateHandler() + with patch("azurelinuxagent.ga.update.conf.get_wait_for_cloud_init", return_value=True): + with patch("azurelinuxagent.ga.update.conf.get_wait_for_cloud_init_timeout", return_value=1): + with self.create_mock_run_command(delay=5): + with patch("azurelinuxagent.ga.update.logger.error") as mock_logger: + update_handler._wait_for_cloud_init() + call_args = [args for args, _ in mock_logger.call_args_list if "An error occurred while waiting for cloud-init" in args[0]] + self.assertTrue( + len(call_args) == 1 and len(call_args[0]) == 1 and "command timeout" in call_args[0][0], + "Expected a timeout waiting for cloud-init. 
Log calls: {0}".format(mock_logger.call_args_list)) + + def test_update_handler_should_wait_for_cloud_init_after_agent_update_and_before_extension_processing(self): + method_calls = [] + + agent_update_handler = Mock() + agent_update_handler.run = lambda *_, **__: method_calls.append("AgentUpdateHandler.run()") + + exthandlers_handler = Mock() + exthandlers_handler.run = lambda *_, **__: method_calls.append("ExtHandlersHandler.run()") + + with mock_wire_protocol(DATA_FILE) as protocol: + with mock_update_handler(protocol, iterations=1, agent_update_handler=agent_update_handler, exthandlers_handler=exthandlers_handler) as update_handler: + with patch('azurelinuxagent.ga.update.UpdateHandler._wait_for_cloud_init', side_effect=lambda *_, **__: method_calls.append("UpdateHandler._wait_for_cloud_init()")): + update_handler.run() + + self.assertListEqual(["AgentUpdateHandler.run()", "UpdateHandler._wait_for_cloud_init()", "ExtHandlersHandler.run()"], method_calls, "Wait for cloud-init should happen after agent update and before extension processing") + + class UpdateHandlerRunTestCase(AgentTestCase): def _test_run(self, autoupdate_enabled=False, check_daemon_running=False, expected_exit_code=0, emit_restart_event=None): fileutil.write_file(conf.get_agent_pid_file_path(), ustr(42)) diff --git a/tests/lib/mock_update_handler.py b/tests/lib/mock_update_handler.py index f0b311abe..03d7a4452 100644 --- a/tests/lib/mock_update_handler.py +++ b/tests/lib/mock_update_handler.py @@ -86,9 +86,9 @@ def patch_object(target, attribute): try: with patch("azurelinuxagent.ga.exthandlers.get_exthandlers_handler", return_value=exthandlers_handler): - with patch("azurelinuxagent.ga.agent_update_handler.get_agent_update_handler", return_value=agent_update_handler): + with patch("azurelinuxagent.ga.update.get_agent_update_handler", return_value=agent_update_handler): with patch("azurelinuxagent.ga.remoteaccess.get_remote_access_handler", return_value=remote_access_handler): - with patch("azurelinuxagent.common.conf.get_autoupdate_enabled", return_value=autoupdate_enabled): + with patch("azurelinuxagent.ga.update.conf.get_autoupdate_enabled", return_value=autoupdate_enabled): with patch.object(UpdateHandler, "is_running", PropertyMock(side_effect=is_running)): with patch('azurelinuxagent.ga.update.time.sleep', side_effect=lambda _: mock_sleep(0.001)) as sleep: with patch('sys.exit', side_effect=lambda _: 0) as mock_exit: diff --git a/tests/test_agent.py b/tests/test_agent.py index 414faa726..0da6a2a85 100644 --- a/tests/test_agent.py +++ b/tests/test_agent.py @@ -53,6 +53,8 @@ Extensions.Enabled = True Extensions.GoalStatePeriod = 6 Extensions.InitialGoalStatePeriod = 6 +Extensions.WaitForCloudInit = False +Extensions.WaitForCloudInitTimeout = 3600 HttpProxy.Host = None HttpProxy.Port = None Lib.Dir = /var/lib/waagent diff --git a/tests_e2e/orchestrator/lib/agent_test_suite_combinator.py b/tests_e2e/orchestrator/lib/agent_test_suite_combinator.py index fbe53a1bd..4b650e864 100644 --- a/tests_e2e/orchestrator/lib/agent_test_suite_combinator.py +++ b/tests_e2e/orchestrator/lib/agent_test_suite_combinator.py @@ -159,15 +159,25 @@ def create_environment_list(self) -> List[Dict[str, Any]]: for image in images_info: if image in skip_images_info: continue - # 'image.urn' can actually be the URL to a VHD if the runbook provided it in the 'image' parameter + # 'image.urn' can actually be the URL to a VHD or an image from a gallery if the runbook provided it in the 'image' parameter if self._is_vhd(image.urn): 
marketplace_image = "" vhd = image.urn image_name = urllib.parse.urlparse(vhd).path.split('/')[-1] # take the last fragment of the URL's path (e.g. "RHEL_8_Standard-8.3.202006170423.vhd") + shared_gallery = "" + elif self._is_image_from_gallery(image.urn): + marketplace_image = "" + vhd = "" + image_name = self._get_name_of_image_from_gallery(image.urn) + shared_gallery = image.urn else: marketplace_image = image.urn vhd = "" image_name = self._get_image_name(image.urn) + shared_gallery = "" + + if test_suite_info.executes_on_scale_set and (vhd != "" or shared_gallery != ""): + raise Exception("VHDS and images from galleries are currently not supported on scale sets.") location: str = self._get_location(test_suite_info, image) if location is None: @@ -194,6 +204,7 @@ def create_environment_list(self) -> List[Dict[str, Any]]: env_name=f"{image_name}-{test_suite_info.name}", marketplace_image=marketplace_image, vhd=vhd, + shared_gallery=shared_gallery, location=location, vm_size=vm_size, test_suite_info=test_suite_info) @@ -206,9 +217,6 @@ def create_environment_list(self) -> List[Dict[str, Any]]: env["c_test_suites"].append(test_suite_info) else: if test_suite_info.executes_on_scale_set: - # TODO: Add support for VHDs - if vhd != "": - raise Exception("VHDS are currently not supported on scale sets.") env = self.create_vmss_environment( env_name=env_name, marketplace_image=marketplace_image, @@ -220,18 +228,18 @@ def create_environment_list(self) -> List[Dict[str, Any]]: env_name=env_name, marketplace_image=marketplace_image, vhd=vhd, + shared_gallery=shared_gallery, location=location, vm_size=vm_size, test_suite_info=test_suite_info) shared_environments[env_name] = env - if test_suite_info.template != '': - vm_tags = env.get("vm_tags") - if vm_tags is not None: - if "templates" not in vm_tags: - vm_tags["templates"] = test_suite_info.template - else: - vm_tags["templates"] += "," + test_suite_info.template + if test_suite_info.template != '': + vm_tags = env["vm_tags"] + if "templates" not in vm_tags: + vm_tags["templates"] = test_suite_info.template + else: + vm_tags["templates"] += "," + test_suite_info.template environments.extend(shared_environments.values()) @@ -330,7 +338,7 @@ def create_existing_vmss_environment(self) -> Dict[str, Any]: "c_test_suites": loader.test_suites, } - def create_vm_environment(self, env_name: str, marketplace_image: str, vhd: str, location: str, vm_size: str, test_suite_info: TestSuiteInfo) -> Dict[str, Any]: + def create_vm_environment(self, env_name: str, marketplace_image: str, vhd: str, shared_gallery: str, location: str, vm_size: str, test_suite_info: TestSuiteInfo) -> Dict[str, Any]: # # Custom ARM templates (to create the test VMs) require special handling. These templates are processed by the azure_update_arm_template # hook, which does not have access to the runbook variables. Instead, we use a dummy VM tag named "templates" and pass the @@ -339,11 +347,9 @@ def create_vm_environment(self, env_name: str, marketplace_image: str, vhd: str, # share the same test environment. Similarly, we use a dummy VM tag named "allow_ssh" to pass the value of the "allow_ssh" runbook parameter. 
# vm_tags = {} - if test_suite_info.template != '': - vm_tags["templates"] = test_suite_info.template if self.runbook.allow_ssh != '': vm_tags["allow_ssh"] = self.runbook.allow_ssh - return { + environment = { "c_platform": [ { "type": "azure", @@ -366,6 +372,7 @@ def create_vm_environment(self, env_name: str, marketplace_image: str, vhd: str, "azure": { "marketplace": marketplace_image, "vhd": vhd, + "shared_gallery": shared_gallery, "location": location, "vm_size": vm_size } @@ -383,6 +390,18 @@ def create_vm_environment(self, env_name: str, marketplace_image: str, vhd: str, "vm_tags": vm_tags } + if shared_gallery != '': + # Currently all the images in our shared gallery require secure boot + environment['c_platform'][0]['requirement']["features"] = { + "items": [ + { + "type": "Security_Profile", + "security_profile": "secureboot" + } + ] + } + return environment + def create_vmss_environment(self, env_name: str, marketplace_image: str, location: str, vm_size: str, test_suite_info: TestSuiteInfo) -> Dict[str, Any]: return { "c_platform": [ @@ -406,7 +425,8 @@ def create_vmss_environment(self, env_name: str, marketplace_image: str, locatio "c_location": location, "c_image": marketplace_image, "c_is_vhd": False, - "c_vm_size": vm_size + "c_vm_size": vm_size, + "vm_tags": {} } def _get_runbook_images(self, loader: AgentTestLoader) -> List[VmImageInfo]: @@ -420,12 +440,12 @@ def _get_runbook_images(self, loader: AgentTestLoader) -> List[VmImageInfo]: if images is not None: return images - # If it is not image or image set, it must be a URN or VHD - if not self._is_urn(self.runbook.image) and not self._is_vhd(self.runbook.image): - raise Exception(f"The 'image' parameter must be an image, an image set name, a urn, or a vhd: {self.runbook.image}") + # If it is not image or image set, it must be a URN, VHD, or an image from a gallery + if not self._is_urn(self.runbook.image) and not self._is_vhd(self.runbook.image) and not self._is_image_from_gallery(self.runbook.image): + raise Exception(f"The 'image' parameter must be an image, image set name, urn, vhd, or an image from a shared gallery: {self.runbook.image}") i = VmImageInfo() - i.urn = self.runbook.image # Note that this could be a URN or the URI for a VHD + i.urn = self.runbook.image # Note that this could be a URN or the URI for a VHD, or an image from a shared gallery i.locations = [] i.vm_sizes = [] @@ -536,6 +556,20 @@ def _is_vhd(vhd: str) -> bool: parsed = urllib.parse.urlparse(vhd) return parsed.scheme == 'https' and parsed.netloc != "" and parsed.path != "" + # Images from a gallery are given as "//". 
+ _IMAGE_FROM_GALLERY = re.compile(r"(?P[^/]+)/(?P[^/]+)/(?P[^/]+)") + + @staticmethod + def _is_image_from_gallery(image: str) -> bool: + return AgentTestSuitesCombinator._IMAGE_FROM_GALLERY.match(image) is not None + + @staticmethod + def _get_name_of_image_from_gallery(image: str) -> bool: + match = AgentTestSuitesCombinator._IMAGE_FROM_GALLERY.match(image) + if match is None: + raise Exception(f"Invalid image from gallery: {image}") + return match.group('image') + @staticmethod def _report_test_result( suite_name: str, diff --git a/tests_e2e/orchestrator/runbook.yml b/tests_e2e/orchestrator/runbook.yml index 9181e9189..ed0b816b1 100644 --- a/tests_e2e/orchestrator/runbook.yml +++ b/tests_e2e/orchestrator/runbook.yml @@ -29,7 +29,7 @@ variable: # Test suites to execute # - name: test_suites - value: "agent_bvt, no_outbound_connections, extensions_disabled, agent_not_provisioned, fips, agent_ext_workflow, agent_status, multi_config_ext, agent_cgroups, ext_cgroups, agent_firewall, ext_telemetry_pipeline, ext_sequencing, agent_persist_firewall, publish_hostname, agent_update" + value: "agent_bvt, no_outbound_connections, extensions_disabled, agent_not_provisioned, fips, agent_ext_workflow, agent_status, multi_config_ext, agent_cgroups, ext_cgroups, agent_firewall, ext_telemetry_pipeline, ext_sequencing, agent_persist_firewall, publish_hostname, agent_update, agent_wait_for_cloud_init" # # Parameters used to create test VMs diff --git a/tests_e2e/test_suites/agent_wait_for_cloud_init.yml b/tests_e2e/test_suites/agent_wait_for_cloud_init.yml new file mode 100644 index 000000000..727803811 --- /dev/null +++ b/tests_e2e/test_suites/agent_wait_for_cloud_init.yml @@ -0,0 +1,13 @@ +# +# This test verifies that the Agent waits for cloud-init to complete before it starts processing extensions. +# +# NOTE: This test is not fully automated. It requires a custom image where the test Agent has been installed and Extensions.WaitForCloudInit is enabled in waagent.conf. +# To execute it manually, create a custom image and use the 'image' runbook parameter, for example: "-v: image:gallery/wait-cloud-init/1.0.1". +# +name: "AgentWaitForCloudInit" +tests: + - "agent_wait_for_cloud_init/agent_wait_for_cloud_init.py" +template: "agent_wait_for_cloud_init/add_cloud_init_script.py" +install_test_agent: false +# Dummy image, since the parameter is required. The actual image needs to be passed as a parameter to the runbook. +images: "ubuntu_2204" diff --git a/tests_e2e/tests/agent_not_provisioned/disable_agent_provisioning.py b/tests_e2e/tests/agent_not_provisioned/disable_agent_provisioning.py index 6f0a562cd..af3bc738a 100755 --- a/tests_e2e/tests/agent_not_provisioned/disable_agent_provisioning.py +++ b/tests_e2e/tests/agent_not_provisioned/disable_agent_provisioning.py @@ -32,18 +32,11 @@ def update(self, template: Dict[str, Any], is_lisa_template: bool) -> None: # # NOTE: LISA's template uses this function to generate the value for osProfile.linuxConfiguration. The function is - # under the 'lisa' namespace. + # under the 'lisa' namespace. We set 'provisionVMAgent' to False. # # "getLinuxConfiguration": { # "parameters": [ - # { - # "name": "keyPath", - # "type": "string" - # }, - # { - # "name": "publicKeyData", - # "type": "string" - # } + # ... 
# ], # "output": { # "type": "object", @@ -62,31 +55,9 @@ def update(self, template: Dict[str, Any], is_lisa_template: bool) -> None: # } # } # - # The code below sets template['functions'][i]['members']['getLinuxConfiguration']['output']['value']['provisionVMAgent'] to True, - # where template['functions'][i] is the 'lisa' namespace. - # - functions = template.get("functions") - if functions is None: - raise Exception('Cannot find "functions" in the LISA template.') - for namespace in functions: - name = namespace.get("namespace") - if name is None: - raise Exception(f'Cannot find "namespace" in the LISA template: {namespace}') - if name == "lisa": - members = namespace.get('members') - if members is None: - raise Exception(f'Cannot find the members of the lisa namespace in the LISA template: {namespace}') - get_linux_configuration = members.get('getLinuxConfiguration') - if get_linux_configuration is None: - raise Exception(f'Cannot find the "getLinuxConfiguration" function the lisa namespace in the LISA template: {namespace}') - output = get_linux_configuration.get('output') - if output is None: - raise Exception(f'Cannot find the "output" of the getLinuxConfiguration function in the LISA template: {get_linux_configuration}') - value = output.get('value') - if value is None: - raise Exception(f"Cannot find the output's value of the getLinuxConfiguration function in the LISA template: {get_linux_configuration}") - value['provisionVMAgent'] = False - break - else: - raise Exception(f'Cannot find the "lisa" namespace in the LISA template: {functions}') + get_linux_configuration = self.get_lisa_function(template, 'getLinuxConfiguration') + output = self.get_function_output(get_linux_configuration) + if output.get('customData') is not None: + raise Exception(f"The getOSProfile function already has a 'customData'. Won't override it. Definition: {get_linux_configuration}") + output['provisionVMAgent'] = False diff --git a/tests_e2e/tests/agent_wait_for_cloud_init/add_cloud_init_script.py b/tests_e2e/tests/agent_wait_for_cloud_init/add_cloud_init_script.py new file mode 100755 index 000000000..1fbc60adc --- /dev/null +++ b/tests_e2e/tests/agent_wait_for_cloud_init/add_cloud_init_script.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import base64 + +from typing import Any, Dict + +from tests_e2e.tests.agent_wait_for_cloud_init.agent_wait_for_cloud_init import AgentWaitForCloudInit +from tests_e2e.tests.lib.update_arm_template import UpdateArmTemplate + + +class AddCloudInitScript(UpdateArmTemplate): + """ + Adds AgentWaitForCloudInit.CloudInitScript to the ARM template as osProfile.customData. 
+ """ + def update(self, template: Dict[str, Any], is_lisa_template: bool) -> None: + if not is_lisa_template: + raise Exception('This test can only customize LISA ARM templates.') + + # + # cloud-init configuration needs to be added in the osProfile.customData property as a base64-encoded string. + # + # LISA uses the getOSProfile function to generate the value for osProfile; add customData to its output, checking that we do not + # override any existing value (the current LISA template does not have any). + # + # "getOSProfile": { + # "parameters": [ + # ... + # ], + # "output": { + # "type": "object", + # "value": { + # "computername": "[parameters('computername')]", + # "adminUsername": "[parameters('admin_username')]", + # "adminPassword": "[if(parameters('has_password'), parameters('admin_password'), json('null'))]", + # "linuxConfiguration": "[if(parameters('has_linux_configuration'), parameters('linux_configuration'), json('null'))]" + # } + # } + # } + # + encoded_script = base64.b64encode(AgentWaitForCloudInit.CloudInitScript.encode('utf-8')).decode('utf-8') + + get_os_profile = self.get_lisa_function(template, 'getOSProfile') + output = self.get_function_output(get_os_profile) + if output.get('customData') is not None: + raise Exception(f"The getOSProfile function already has a 'customData'. Won't override it. Definition: {get_os_profile}") + output['customData'] = encoded_script + diff --git a/tests_e2e/tests/agent_wait_for_cloud_init/agent_wait_for_cloud_init.py b/tests_e2e/tests/agent_wait_for_cloud_init/agent_wait_for_cloud_init.py new file mode 100755 index 000000000..d9b4ecaef --- /dev/null +++ b/tests_e2e/tests/agent_wait_for_cloud_init/agent_wait_for_cloud_init.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import time + +from assertpy import fail + +from tests_e2e.tests.lib.agent_test import AgentVmTest +from tests_e2e.tests.lib.logging import log +from tests_e2e.tests.lib.shell import CommandError +from tests_e2e.tests.lib.ssh_client import SshClient + + +class AgentWaitForCloudInit(AgentVmTest): + """ + This test verifies that the Agent waits for cloud-init to complete before it starts processing extensions. + + To do this, it adds 'CloudInitScript' in cloud-init's custom data. The script ensures first that the Agent + is waiting for cloud-init, and then sleeps for a couple of minutes before completing. The scripts appends + a set of known messages to waagent.log, and the test simply verifies that the messages are present in the + log in the expected order, and that they occur before the Agent reports that it is processing extensions. + """ + CloudInitScript = """#!/usr/bin/env bash + set -euox pipefail + + echo ">>> $(date) cloud-init script begin" >> /var/log/waagent.log + while ! 
grep 'Waiting for cloud-init to complete' /var/log/waagent.log; do + sleep 15 + done + echo ">>> $(date) The Agent is waiting for cloud-init, will pause for a couple of minutes" >> /var/log/waagent.log + sleep 120 + echo ">>> $(date) cloud-init script end" >> /var/log/waagent.log + """ + + def run(self): + ssh_client: SshClient = self._context.create_ssh_client() + + log.info("Waiting for Agent to start processing extensions") + for _ in range(15): + try: + ssh_client.run_command("grep 'ProcessExtensionsGoalState started' /var/log/waagent.log") + break + except CommandError: + log.info("The Agent has not started to process extensions, will check again after a short delay") + time.sleep(60) + else: + raise Exception("Timeout while waiting for the Agent to start processing extensions") + + log.info("The Agent has started to process extensions") + + output = ssh_client.run_command( + "grep -E '^>>>|" + + "INFO ExtHandler ExtHandler cloud-init completed|" + + "INFO ExtHandler ExtHandler ProcessExtensionsGoalState started' /var/log/waagent.log") + + output = output.rstrip().splitlines() + + expected = [ + 'cloud-init script begin', + 'The Agent is waiting for cloud-init, will pause for a couple of minutes', + 'cloud-init script end', + 'cloud-init completed', + 'ProcessExtensionsGoalState started' + ] + + indent = lambda lines: "\n".join([f" {ln}" for ln in lines]) + if len(output) == len(expected) and all([expected[i] in output[i] for i in range(len(expected))]): + log.info("The Agent waited for cloud-init before processing extensions.\nLog messages:\n%s", indent(output)) + else: + fail(f"The Agent did not wait for cloud-init before processing extensions.\nExpected:\n{indent(expected)}\nActual:\n{indent(output)}") + + +if __name__ == "__main__": + AgentWaitForCloudInit.run_from_command_line() + diff --git a/tests_e2e/tests/lib/update_arm_template.py b/tests_e2e/tests/lib/update_arm_template.py index af69fba04..010178ab9 100644 --- a/tests_e2e/tests/lib/update_arm_template.py +++ b/tests_e2e/tests/lib/update_arm_template.py @@ -55,4 +55,87 @@ def get_resource_by_name(resources: List[Dict[str, Any]], resource_name: str, ty return item raise KeyError(f"Cannot find a resource {resource_name} of type {type_name} in the ARM template") + @staticmethod + def get_lisa_function(template: Dict[str, Any], function_name: str) -> Dict[str, Any]: + """ + Looks for the given function name in the LISA namespace and returns its definition. Raises KeyError if the function is not found. + """ + # + # NOTE: LISA's functions are in the "lisa" namespace, for example: + # + # "functions": [ + # { + # "namespace": "lisa", + # "members": { + # "getOSProfile": { + # "parameters": [ + # { + # "name": "computername", + # "type": "string" + # }, + # etc. 
+ # ], + # "output": { + # "type": "object", + # "value": { + # "computername": "[parameters('computername')]", + # "adminUsername": "[parameters('admin_username')]", + # "adminPassword": "[if(parameters('has_password'), parameters('admin_password'), json('null'))]", + # "linuxConfiguration": "[if(parameters('has_linux_configuration'), parameters('linux_configuration'), json('null'))]" + # } + # } + # }, + # } + # } + # ] + functions = template.get("functions") + if functions is None: + raise Exception('Cannot find "functions" in the LISA template.') + for namespace in functions: + name = namespace.get("namespace") + if name is None: + raise Exception(f'Cannot find "namespace" in the LISA template: {namespace}') + if name == "lisa": + lisa_functions = namespace.get('members') + if lisa_functions is None: + raise Exception(f'Cannot find the members of the lisa namespace in the LISA template: {namespace}') + function_definition = lisa_functions.get(function_name) + if function_definition is None: + raise KeyError(f'Cannot find function {function_name} in the lisa namespace in the LISA template: {namespace}') + return function_definition + raise Exception(f'Cannot find the "lisa" namespace in the LISA template: {functions}') + + @staticmethod + def get_function_output(function: Dict[str, Any]) -> Dict[str, Any]: + """ + Returns the "value" property of the output for the given function. + + Sample function: + + { + "parameters": [ + { + "name": "computername", + "type": "string" + }, + etc. + ], + "output": { + "type": "object", + "value": { + "computername": "[parameters('computername')]", + "adminUsername": "[parameters('admin_username')]", + "adminPassword": "[if(parameters('has_password'), parameters('admin_password'), json('null'))]", + "linuxConfiguration": "[if(parameters('has_linux_configuration'), parameters('linux_configuration'), json('null'))]" + } + } + } + """ + output = function.get('output') + if output is None: + raise Exception(f'Cannot find the "output" of the given function: {function}') + value = output.get('value') + if value is None: + raise Exception(f"Cannot find the output's value of the given function: {function}") + return value From cc689f5b8a7c51385e5fa3bd4859500147b5d0cf Mon Sep 17 00:00:00 2001 From: maddieford <93676569+maddieford@users.noreply.github.com> Date: Fri, 2 Feb 2024 11:34:24 -0800 Subject: [PATCH 10/11] Revert changes to publish_hostname in RedhatOSModernUtil (#3032) * Revert changes to publish_hostname in RedhatOSModernUtil * Fix pylint bad-super-call --- azurelinuxagent/common/osutil/redhat.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/azurelinuxagent/common/osutil/redhat.py b/azurelinuxagent/common/osutil/redhat.py index 05a4b659d..2d8ff3d1e 100644 --- a/azurelinuxagent/common/osutil/redhat.py +++ b/azurelinuxagent/common/osutil/redhat.py @@ -192,3 +192,10 @@ def restart_if(self, ifname, retries=3, wait=5): time.sleep(wait) else: logger.warn("exceeded restart retries") + + def publish_hostname(self, hostname): + # RedhatOSUtil was updated to conditionally run NetworkManager restart in response to a race condition between + # NetworkManager restart and the agent restarting the network interface during publish_hostname. Keeping the + # NetworkManager restart in RedhatOSModernUtil because the issue was not reproduced on these versions. 
+ shellutil.run("service NetworkManager restart") + DefaultOSUtil.publish_hostname(self, hostname) From 20f06702462cf56cb6a96e6ec866deccb92f1cd3 Mon Sep 17 00:00:00 2001 From: Norberto Arrieta Date: Fri, 2 Feb 2024 14:36:30 -0800 Subject: [PATCH 11/11] Remove agent_wait_for_cloud_init from automated runs (#3034) Co-authored-by: narrieta --- tests_e2e/orchestrator/runbook.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests_e2e/orchestrator/runbook.yml b/tests_e2e/orchestrator/runbook.yml index ed0b816b1..9181e9189 100644 --- a/tests_e2e/orchestrator/runbook.yml +++ b/tests_e2e/orchestrator/runbook.yml @@ -29,7 +29,7 @@ variable: # Test suites to execute # - name: test_suites - value: "agent_bvt, no_outbound_connections, extensions_disabled, agent_not_provisioned, fips, agent_ext_workflow, agent_status, multi_config_ext, agent_cgroups, ext_cgroups, agent_firewall, ext_telemetry_pipeline, ext_sequencing, agent_persist_firewall, publish_hostname, agent_update, agent_wait_for_cloud_init" + value: "agent_bvt, no_outbound_connections, extensions_disabled, agent_not_provisioned, fips, agent_ext_workflow, agent_status, multi_config_ext, agent_cgroups, ext_cgroups, agent_firewall, ext_telemetry_pipeline, ext_sequencing, agent_persist_firewall, publish_hostname, agent_update" # # Parameters used to create test VMs
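
The AgentTestSuitesCombinator changes above accept a third image format besides URNs and VHDs: an image from a shared gallery, written as a slash-separated triple and parsed with the _IMAGE_FROM_GALLERY named-group regex. A minimal standalone sketch of that parsing follows; only the 'image' group name is confirmed by the patch's match.group('image') call, so the 'gallery' and 'version' names are assumptions, and the sample value is taken from the comment in agent_wait_for_cloud_init.yml.

    import re

    # Same pattern as AgentTestSuitesCombinator._IMAGE_FROM_GALLERY above; the 'gallery' and
    # 'version' group names are assumed, only 'image' is confirmed by the match.group('image') call.
    IMAGE_FROM_GALLERY = re.compile(r"(?P<gallery>[^/]+)/(?P<image>[^/]+)/(?P<version>[^/]+)")


    def parse_gallery_image(image: str) -> dict:
        """Splits a '<gallery>/<image>/<version>' string into its three components."""
        match = IMAGE_FROM_GALLERY.match(image)
        if match is None:
            raise ValueError(f"Invalid image from gallery: {image}")
        return {name: match.group(name) for name in ("gallery", "image", "version")}


    # Sample value from the agent_wait_for_cloud_init.yml comment ("-v: image:gallery/wait-cloud-init/1.0.1")
    print(parse_gallery_image("gallery/wait-cloud-init/1.0.1"))
    # {'gallery': 'gallery', 'image': 'wait-cloud-init', 'version': '1.0.1'}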
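The get_lisa_function and get_function_output helpers added to update_arm_template.py walk the template's "functions" array to the "lisa" namespace and return a member's output value; AddCloudInitScript then stores a base64-encoded script there as osProfile.customData. The sketch below reproduces that flow against a heavily trimmed, hand-written stand-in for the LISA template (the real template has many more members and parameters), so treat the dict as an illustration only.

    import base64

    # Hand-written stand-in for LISA's ARM template; only the pieces the helpers touch are included.
    template = {
        "functions": [
            {
                "namespace": "lisa",
                "members": {
                    "getOSProfile": {
                        "output": {
                            "type": "object",
                            "value": {"computername": "[parameters('computername')]"}
                        }
                    }
                }
            }
        ]
    }


    def get_lisa_function(template: dict, function_name: str) -> dict:
        # Same traversal as UpdateArmTemplate.get_lisa_function: find the 'lisa' namespace, then the member.
        for namespace in template["functions"]:
            if namespace.get("namespace") == "lisa":
                return namespace["members"][function_name]
        raise KeyError('Cannot find the "lisa" namespace in the template')


    cloud_init_script = "#!/usr/bin/env bash\necho 'hello from cloud-init' >> /var/log/waagent.log\n"

    # osProfile.customData must be a base64-encoded string, as in AddCloudInitScript.update()
    output_value = get_lisa_function(template, "getOSProfile")["output"]["value"]
    if output_value.get("customData") is not None:
        raise Exception("getOSProfile already defines customData; won't override it")
    output_value["customData"] = base64.b64encode(cloud_init_script.encode("utf-8")).decode("utf-8")

    print(output_value["customData"])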
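AgentWaitForCloudInit decides pass/fail by grepping waagent.log for a fixed set of markers and requiring one matching line per marker, in order. A small sketch of that in-order check follows; the marker strings are the ones used by the test, while the sample log lines are invented for illustration.

    # Invented grep output; in the real test these lines are pulled from /var/log/waagent.log over SSH.
    output = [
        ">>> Fri Feb  2 00:01:00 UTC 2024 cloud-init script begin",
        ">>> Fri Feb  2 00:01:15 UTC 2024 The Agent is waiting for cloud-init, will pause for a couple of minutes",
        ">>> Fri Feb  2 00:03:15 UTC 2024 cloud-init script end",
        "2024-02-02T00:03:20Z INFO ExtHandler ExtHandler cloud-init completed",
        "2024-02-02T00:03:21Z INFO ExtHandler ExtHandler ProcessExtensionsGoalState started",
    ]

    # Markers from the test, in the order they must appear in the log.
    expected = [
        "cloud-init script begin",
        "The Agent is waiting for cloud-init, will pause for a couple of minutes",
        "cloud-init script end",
        "cloud-init completed",
        "ProcessExtensionsGoalState started",
    ]

    # Pass only if there is exactly one line per marker and each marker appears in the line at the same
    # position, i.e. the events were logged in the expected order.
    in_order = len(output) == len(expected) and all(marker in line for marker, line in zip(expected, output))
    print("ordering OK" if in_order else "ordering check failed")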