Skip to content

Commit

Permalink
Agent status scenario (Azure#2875)
Browse files Browse the repository at this point in the history
* Update version to dummy 1.0.0.0'

* Revert version change

* Create files for agent status scenario

* Add agent status test logic

* fix pylint error

* Add comment for retry

* Mark failures as exceptions

* Improve messages in logs

* Improve comments

* Update comments

* Check that agent status updates without processing additional goal states 3 times

* Remove unused agent status exception

* Update comment

* Clean up comments, logs, and imports

* Exception should inherit from BaseException

* Import datetime

* Import datetime

* Import timedelta

* instance view time is already formatted

* Increase status update time

* Increase status update time

* Increase status update time

* Increase timeout

* Update comments and timeouts

* Allow retry if agent status timestamp isn't updated after 30s

* Remove unused import

* Update time value in comment

* address PR comments

* Check if properties are None

* Make types & errors more readable

* Re-use vm_agent variable

* Add comment for dot operator
  • Loading branch information
maddieford authored and nagworld9 committed Aug 30, 2023
1 parent 2a5d686 commit a7b0af9
Show file tree
Hide file tree
Showing 4 changed files with 252 additions and 1 deletion.
2 changes: 1 addition & 1 deletion tests_e2e/orchestrator/runbook.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ variable:
#
# The test suites to execute
- name: test_suites
value: "agent_bvt, no_outbound_connections, extensions_disabled, agent_not_provisioned, fips, agent_ext_workflow, agent_update"
value: "agent_bvt, no_outbound_connections, extensions_disabled, agent_not_provisioned, fips, agent_ext_workflow, agent_update, agent_status"
- name: cloud
value: "AzureCloud"
is_case_visible: true
Expand Down
9 changes: 9 additions & 0 deletions tests_e2e/test_suites/agent_status.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#
# This scenario validates the agent status is updated without any goal state changes
#
name: "AgentStatus"
tests:
- "agent_status/agent_status.py"
images:
- "endorsed"
- "endorsed-arm64"
195 changes: 195 additions & 0 deletions tests_e2e/tests/agent_status/agent_status.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
#!/usr/bin/env python3

# Microsoft Azure Linux Agent
#
# Copyright 2018 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

#
# Validates the agent status is updated without processing additional goal states (aside from the first goal state
# from fabric)
#

from azure.mgmt.compute.models import VirtualMachineInstanceView, InstanceViewStatus, VirtualMachineAgentInstanceView
from assertpy import assert_that
from datetime import datetime, timedelta
from time import sleep
import json

from tests_e2e.tests.lib.agent_test import AgentTest
from tests_e2e.tests.lib.agent_test_context import AgentTestContext
from tests_e2e.tests.lib.logging import log
from tests_e2e.tests.lib.virtual_machine_client import VirtualMachineClient


class RetryableAgentStatusException(BaseException):
    """
    Signals that the agent status found in the instance view is invalid, or has not been refreshed yet.

    The retry loop in AgentStatus.run() catches this exception, waits, and checks the instance view again.
    Note that it derives from BaseException (not Exception), so generic 'except Exception' handlers do not catch it.
    """


class AgentStatus(AgentTest):
    """
    Validates that the agent status reported in the VM instance view is valid and keeps being refreshed
    (its timestamp advances) while the last goal state processed by the agent, as reported in the agent log,
    stays the same.
    """

    def __init__(self, context: AgentTestContext):
        super().__init__(context)
        self._ssh_client = self._context.create_ssh_client()

    def validate_instance_view_vmagent_status(self, instance_view: VirtualMachineInstanceView):
        """
        Validates the first entry of instance_view.vm_agent.statuses.

        The caller must have verified that vm_agent and vm_agent.statuses are not None and that statuses has at
        least one entry (validate_instance_view_vmagent() does this before calling this method).

        Raises RetryableAgentStatusException if 'message', 'display_status' or 'time' is None, if the message
        mentions an unresponsive agent, or if the display status contains 'Not Ready'.
        """
        status: InstanceViewStatus = instance_view.vm_agent.statuses[0]

        # Validate message field
        if status.message is None:
            raise RetryableAgentStatusException("Agent status is invalid: 'message' property in instance view is None")
        elif 'unresponsive' in status.message:
            raise RetryableAgentStatusException("Agent status is invalid: Instance view shows unresponsive agent")

        # Validate display status field
        if status.display_status is None:
            raise RetryableAgentStatusException("Agent status is invalid: 'display_status' property in instance view is None")
        elif 'Not Ready' in status.display_status:
            raise RetryableAgentStatusException("Agent status is invalid: Instance view shows agent status is not ready")

        # Validate time field
        if status.time is None:
            raise RetryableAgentStatusException("Agent status is invalid: 'time' property in instance view is None")

    def validate_instance_view_vmagent(self, instance_view: VirtualMachineInstanceView):
        """
        Checks that instance view has vm_agent.statuses and vm_agent.vm_agent_version properties which report the Guest
        Agent as running and Ready:
            "vm_agent": {
                "extension_handlers": [],
                "vm_agent_version": "9.9.9.9",
                "statuses": [
                    {
                        "level": "Info",
                        "time": "2023-08-11T09:13:01.000Z",
                        "message": "Guest Agent is running",
                        "code": "ProvisioningState/succeeded",
                        "display_status": "Ready"
                    }
                ]
            }

        Raises RetryableAgentStatusException if any of the validated properties is missing or reports an invalid state.
        """
        # Using dot operator for properties here because azure.mgmt.compute.models has classes for InstanceViewStatus
        # and VirtualMachineAgentInstanceView. All the properties we validate are attributes of these classes and
        # initialized to None
        if instance_view.vm_agent is None:
            raise RetryableAgentStatusException("Agent status is invalid: 'vm_agent' property in instance view is None")

        # Validate vm_agent_version field
        vm_agent: VirtualMachineAgentInstanceView = instance_view.vm_agent
        if vm_agent.vm_agent_version is None:
            raise RetryableAgentStatusException("Agent status is invalid: 'vm_agent_version' property in instance view is None")
        elif 'Unknown' in vm_agent.vm_agent_version:
            raise RetryableAgentStatusException("Agent status is invalid: Instance view shows agent version is unknown")

        # Validate statuses field
        if vm_agent.statuses is None:
            raise RetryableAgentStatusException("Agent status is invalid: 'statuses' property in instance view is None")
        elif len(vm_agent.statuses) < 1:
            raise RetryableAgentStatusException("Agent status is invalid: Instance view is missing an agent status entry")
        else:
            self.validate_instance_view_vmagent_status(instance_view=instance_view)

        log.info("Instance view has valid agent status, agent version: {0}, status: {1}"
                 .format(vm_agent.vm_agent_version, vm_agent.statuses[0].display_status))

    def check_status_updated(self, status_timestamp: datetime, prev_status_timestamp: datetime, gs_processed_log: str, prev_gs_processed_log: str):
        """
        Compares the current agent status against the previous one.

        If both prev_status_timestamp and prev_gs_processed_log are not None (i.e. this is not the first reported
        status), verifies that:
            * status_timestamp is greater than prev_status_timestamp; otherwise raises
              RetryableAgentStatusException so that the caller can retry.
            * gs_processed_log is equal to prev_gs_processed_log; otherwise raises Exception (not retryable),
              since the agent processed an additional goal state.
        """
        log.info("")
        log.info("Check that the agent status updated without processing any additional goal states...")

        # If prev_ variables are not updated, then this is the first reported agent status
        if prev_status_timestamp is not None and prev_gs_processed_log is not None:
            # The agent status timestamp should be greater than the prev timestamp
            if status_timestamp > prev_status_timestamp:
                log.info(
                    "Current agent status timestamp {0} is greater than previous status timestamp {1}"
                    .format(status_timestamp, prev_status_timestamp))
            else:
                raise RetryableAgentStatusException("Agent status failed to update: Current agent status timestamp {0} "
                                                    "is not greater than previous status timestamp {1}"
                                                    .format(status_timestamp, prev_status_timestamp))

            # The last goal state processed in the agent log should be the same as before
            if prev_gs_processed_log == gs_processed_log:
                # Format the message exactly once: formatting the already-interpolated string a second time would
                # re-parse the log line and fail if it happened to contain '{' or '}'.
                log.info(
                    "The last processed goal state is the same as the last processed goal state in the last agent "
                    "status update: \n{0}".format(gs_processed_log))
            else:
                raise Exception("Agent status failed to update without additional goal state: The agent processed an "
                                "additional goal state since the last agent status update. \n{0}"
                                "".format(gs_processed_log))

        log.info("")
        log.info("The agent status successfully updated without additional goal states")

    def run(self):
        """
        Polls the instance view until 2 valid agent statuses have been checked (the second one must carry a newer
        timestamp than the first, with the same last processed goal state) or until the 6 minute timeout expires.
        Fails the test if that did not happen within the timeout.
        """
        log.info("")
        # NOTE(review): the banner below says "3 times" while the loop requires 2 validated statuses - confirm which
        # one is intended.
        log.info("*******Verifying the agent status updates 3 times*******")

        vm = VirtualMachineClient(self._context.vm)

        timeout = datetime.now() + timedelta(minutes=6)
        instance_view_exception = None
        status_updated = 0
        prev_status_timestamp = None
        prev_gs_processed_log = None

        # Retry validating agent status updates 2 times with timeout of 6 minutes
        while datetime.now() <= timeout and status_updated < 2:
            instance_view = vm.get_instance_view()
            log.info("")
            log.info(
                "Check instance view to validate that the Guest Agent reports valid status...")
            log.info("Instance view of VM is:\n%s", json.dumps(instance_view.serialize(), indent=2))

            try:
                # Validate the guest agent reports valid status
                self.validate_instance_view_vmagent(instance_view)

                status_timestamp = instance_view.vm_agent.statuses[0].time
                gs_processed_log = self._ssh_client.run_command(
                    "agent_status-get_last_gs_processed.py", use_sudo=True)

                self.check_status_updated(status_timestamp, prev_status_timestamp, gs_processed_log, prev_gs_processed_log)

                # Update variables with timestamps for this update
                status_updated += 1
                prev_status_timestamp = status_timestamp
                prev_gs_processed_log = gs_processed_log

                # Sleep 30s to allow agent status to update before we check again; there is nothing left to wait
                # for once the required number of checks has been reached
                if status_updated < 2:
                    sleep(30)

            except RetryableAgentStatusException as e:
                instance_view_exception = str(e)
                log.info("")
                log.info(instance_view_exception)
                log.info("Waiting 30s before retry...")
                sleep(30)

        # If status_updated is 0, we know the agent status in the instance view was never valid
        log.info("")
        assert_that(status_updated > 0).described_as(
            "Timeout has expired, instance view has invalid agent status: {0}".format(
                instance_view_exception)).is_true()

        # Fail the test if we weren't able to validate the agent status updated 2 times
        assert_that(status_updated == 2).described_as(
            "Timeout has expired, the agent status failed to update 2 times").is_true()


# Run the test only when this file is executed directly as a script
if __name__ == "__main__":
    AgentStatus.run_from_command_line()
47 changes: 47 additions & 0 deletions tests_e2e/tests/scripts/agent_status-get_last_gs_processed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/env pypy3

# Microsoft Azure Linux Agent
#
# Copyright 2018 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Writes the last goal state processed line in the log to stdout
#
import re
import sys

from tests_e2e.tests.lib.agent_log import AgentLog


def main():
    """
    Writes to stdout the text of the last agent log record whose message reports a completed goal state
    ("ProcessExtensionsGoalState completed [...]"), or "None" if no such record was found. Always exits with code 0.

    If reading the log raises IOError, the error is reported on stderr (so that stdout only carries the result) and
    whatever was found before the error is still printed.
    """
    # Compiled once, outside of the loop; match() anchors the pattern at the beginning of the record's message
    gs_completed_regex = re.compile(r"ProcessExtensionsGoalState completed\s\[[a-z_\d]{13,14}\s\d+\sms\]")
    last_gs_processed = None
    agent_log = AgentLog()

    try:
        for agent_record in agent_log.read():
            # Keep overwriting so that the last matching record wins
            if gs_completed_regex.match(agent_record.message) is not None:
                last_gs_processed = agent_record.text

    except IOError as e:
        print("Unable to get last goal state processed: {0}".format(str(e)), file=sys.stderr)

    print(last_gs_processed)
    sys.exit(0)


# Run main() only when this file is executed directly as a script
if __name__ == "__main__":
    main()

0 comments on commit a7b0af9

Please sign in to comment.