From 4be43068c22b6fb815eb3a53bdb5f3b476ec6456 Mon Sep 17 00:00:00 2001 From: Aravind Mani <53524901+aravindmani-1@users.noreply.github.com> Date: Wed, 28 Apr 2021 10:47:35 -0700 Subject: [PATCH 01/17] [xcvrd] Enhance Media Settings (#177) #### Description 1. Currently in SONiC, 400G pre-emphasis settings are not programmed via a common methodology. We use Vendor Name + PN to program them, which consumes memory and is a never-ending process, since a Vendor Name + PN entry has to be added for every optic. 2. For 100G/40G optics, specification compliance is not displayed; it was displayed only for DAC. #### Motivation and Context 1. With this PR, we can program the pre-emphasis settings for DAC, AOC and optics. The QSFP_DD parser doesn't have specification compliance, which is needed to program pre-emphasis settings. The platform API code can be changed to return the type_of_media_interface for 400G media specification compliance, so that the type of media can be determined. https://github.com/Azure/sonic-platform-common/blob/a95834b65a9f3b17ab1ce4e1ba5d1a60102e4507/sonic_platform_base/sonic_sfp/sff8024.py#L104 2. To address 100G/40G optics, a new key "QSFP28 - *" / "QSFP+ - *" (type_abbrv_name followed by a hyphen) is introduced. The same key can be defined in a vendor-specific media_settings.json. These changes don't modify the existing behavior but additionally address the above-mentioned issues. Vendors can still add "Vendor_name + PN" in media_settings.json to program media settings if needed. --- sonic-xcvrd/tests/test_xcvrd.py | 2 +- sonic-xcvrd/xcvrd/xcvrd.py | 35 ++++++++++++++++++++------------- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/sonic-xcvrd/tests/test_xcvrd.py b/sonic-xcvrd/tests/test_xcvrd.py index a4d64b68d..11f759f01 100644 --- a/sonic-xcvrd/tests/test_xcvrd.py +++ b/sonic-xcvrd/tests/test_xcvrd.py @@ -316,5 +316,5 @@ def test_get_media_settings_key(self): # Test a bad 'specification_compliance' value xcvr_info_dict[0]['specification_compliance'] = 'N/A' result = get_media_settings_key(0, xcvr_info_dict) - assert result == ['MOLEX-1064141421', 'QSFP+'] + assert result == ['MOLEX-1064141421', 'QSFP+-*'] # TODO: Ensure that error message was logged diff --git a/sonic-xcvrd/xcvrd/xcvrd.py b/sonic-xcvrd/xcvrd/xcvrd.py index 0ae0a0cfd..7b84b5794 100644 --- a/sonic-xcvrd/xcvrd/xcvrd.py +++ b/sonic-xcvrd/xcvrd/xcvrd.py @@ -638,29 +638,36 @@ def get_media_settings_key(physical_port, transceiver_dict): media_len = transceiver_dict[physical_port]['cable_length'] media_compliance_dict_str = transceiver_dict[physical_port]['specification_compliance'] - + media_compliance_code = '' + media_type = '' + media_key = '' media_compliance_dict = {} + try: - media_compliance_dict = ast.literal_eval(media_compliance_dict_str) + if _wrapper_get_sfp_type(physical_port) == 'QSFP_DD': + media_compliance_code = media_compliance_dict_str + else: + media_compliance_dict = ast.literal_eval(media_compliance_dict_str) + if sup_compliance_str in media_compliance_dict: + media_compliance_code = media_compliance_dict[sup_compliance_str] except ValueError as e: helper_logger.log_error("Invalid value for port {} 'specification_compliance': {}".format(physical_port, media_compliance_dict_str)) - media_compliance_code = '' - media_type = '' - - if sup_compliance_str in media_compliance_dict: - media_compliance_code = media_compliance_dict[sup_compliance_str] - media_type = transceiver_dict[physical_port]['type_abbrv_name'] - media_key = '' - if len(media_type) != 0: media_key
+= media_type if len(media_compliance_code) != 0: media_key += '-' + media_compliance_code - if len(media_len) != 0: - media_key += '-' + media_len + 'M' + if _wrapper_get_sfp_type(physical_port) == 'QSFP_DD': + if media_compliance_code == "passive_copper_media_interface": + if len(media_len) != 0: + media_key += '-' + media_len + 'M' + else: + if len(media_len) != 0: + media_key += '-' + media_len + 'M' + else: + media_key += '-' + '*' return [vendor_key, media_key] @@ -742,8 +749,8 @@ def notify_media_setting(logical_port_name, transceiver_dict, key = get_media_settings_key(physical_port, transceiver_dict) media_dict = get_media_settings_value(physical_port, key) - if(len(media_dict) == 0): - helper_logger.log_error("Error in obtaining media setting") + if len(media_dict) == 0: + helper_logger.log_error("Error in obtaining media setting for {}".format(logical_port_name)) return fvs = swsscommon.FieldValuePairs(len(media_dict)) From cdabd09bad6249eaab5d1d21ee28d182473279ee Mon Sep 17 00:00:00 2001 From: vdahiya12 <67608553+vdahiya12@users.noreply.github.com> Date: Thu, 6 May 2021 11:32:34 -0700 Subject: [PATCH 02/17] [xcvrd] Change the y_cable presence logic to use "mux_cable" table as identifier from Config DB (#176) * [xcvrd] Change the y_cable presence logic to use "mux_cable" table as identfier from Config DB This PR changes the logic to use "mux_cable" table as identifier, and only those ports which have this table inside config DB and key pair of "state:auto/active" will be processed as having a Y-Cable port. The earlier logic was using the PORT table tag with a mux_cable:true key-value pair. Description change the logic to initiate Y-Cable logic Motivation and Context Required for changing the logic, so that this can be integrated with images which don't support "mux_cable" tag inside config DB Signed-off-by: vaibhav-dahiya --- .../xcvrd/xcvrd_utilities/y_cable_helper.py | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/sonic-xcvrd/xcvrd/xcvrd_utilities/y_cable_helper.py b/sonic-xcvrd/xcvrd/xcvrd_utilities/y_cable_helper.py index aacc264b3..aba08a769 100644 --- a/sonic-xcvrd/xcvrd/xcvrd_utilities/y_cable_helper.py +++ b/sonic-xcvrd/xcvrd/xcvrd_utilities/y_cable_helper.py @@ -282,9 +282,9 @@ def check_identifier_presence_and_update_mux_table_entry(state_db, port_tbl, y_c else: # Convert list of tuples to a dictionary mux_table_dict = dict(fvs) - if "mux_cable" in mux_table_dict: - val = mux_table_dict.get("mux_cable", None) - if val == "true": + if "state" in mux_table_dict: + val = mux_table_dict.get("state", None) + if val in ["active", "auto"]: y_cable_asic_table = y_cable_tbl.get(asic_index, None) mux_asic_table = mux_tbl.get(asic_index, None) @@ -314,6 +314,9 @@ def check_identifier_presence_and_update_mux_table_entry(state_db, port_tbl, y_c logical_port_name, y_cable_tbl[asic_index]) post_port_mux_info_to_db(logical_port_name, mux_tbl[asic_index]) post_port_mux_static_info_to_db(logical_port_name, static_tbl[asic_index]) + else: + helper_logger.log_warning( + "Could not retreive active or auto value for state kvp for {}, inside MUX_CABLE table".format(logical_port_name)) def check_identifier_presence_and_delete_mux_table_entry(state_db, port_tbl, asic_index, logical_port_name, y_cable_presence, delete_change_event): @@ -373,7 +376,7 @@ def init_ports_status_for_y_cable(platform_sfp, platform_chassis, y_cable_presen for namespace in namespaces: asic_id = multi_asic.get_asic_index_from_namespace(namespace) config_db[asic_id] = 
daemon_base.db_connect("CONFIG_DB", namespace) - port_tbl[asic_id] = swsscommon.Table(config_db[asic_id], "PORT") + port_tbl[asic_id] = swsscommon.Table(config_db[asic_id], "MUX_CABLE") port_table_keys[asic_id] = port_tbl[asic_id].getKeys() # Init PORT_STATUS table if ports are on Y cable @@ -414,7 +417,7 @@ def change_ports_status_for_y_cable_change_event(port_dict, y_cable_presence, st for namespace in namespaces: asic_id = multi_asic.get_asic_index_from_namespace(namespace) config_db[asic_id] = daemon_base.db_connect("CONFIG_DB", namespace) - port_tbl[asic_id] = swsscommon.Table(config_db[asic_id], "PORT") + port_tbl[asic_id] = swsscommon.Table(config_db[asic_id], "MUX_CABLE") port_table_keys[asic_id] = port_tbl[asic_id].getKeys() # Init PORT_STATUS table if ports are on Y cable and an event is received @@ -518,9 +521,9 @@ def check_identifier_presence_and_update_mux_info_entry(state_db, mux_tbl, asic_ else: # Convert list of tuples to a dictionary mux_table_dict = dict(fvs) - if "mux_cable" in mux_table_dict: - val = mux_table_dict.get("mux_cable", None) - if val == "true": + if "state" in mux_table_dict: + val = mux_table_dict.get("state", None) + if val in ["active", "auto"]: if mux_tbl.get(asic_index, None) is not None: # fill in the newly found entry @@ -535,6 +538,9 @@ def check_identifier_presence_and_update_mux_info_entry(state_db, mux_tbl, asic_ mux_tbl[asic_id] = swsscommon.Table(state_db[asic_id], MUX_CABLE_INFO_TABLE) # fill the newly found entry post_port_mux_info_to_db(logical_port_name, mux_tbl[asic_index]) + else: + helper_logger.log_warning( + "Could not retreive active or auto value for state kvp for {}, inside MUX_CABLE table".format(logical_port_name)) def get_firmware_dict(physical_port, target, side, mux_info_dict): From 665fcd90a698550c7f688b3c7fd0af78ccdbe74b Mon Sep 17 00:00:00 2001 From: Aravind Mani <53524901+aravindmani-1@users.noreply.github.com> Date: Mon, 10 May 2021 09:15:42 -0700 Subject: [PATCH 03/17] [xcvrd] Fix crash for QSFP DD media (#181) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Xcvrd crashes when pushing QSFP DD DOM info to state DB. 
#### Description **xcvrd crash log:** “May 6 05:12:37.446235 S1G2 ERR pmon#xcvrd[5292]: This functionality is currently not implemented for this platform” --- sonic-xcvrd/xcvrd/xcvrd.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/sonic-xcvrd/xcvrd/xcvrd.py b/sonic-xcvrd/xcvrd/xcvrd.py index 7b84b5794..873a1fa52 100644 --- a/sonic-xcvrd/xcvrd/xcvrd.py +++ b/sonic-xcvrd/xcvrd/xcvrd.py @@ -206,7 +206,7 @@ def _wrapper_get_sfp_type(physical_port): # Remove unnecessary unit from the raw data -def beautify_dom_info_dict(dom_info_dict): +def beautify_dom_info_dict(dom_info_dict, physical_port): dom_info_dict['temperature'] = strip_unit_and_beautify(dom_info_dict['temperature'], TEMP_UNIT) dom_info_dict['voltage'] = strip_unit_and_beautify(dom_info_dict['voltage'], VOLT_UNIT) dom_info_dict['rx1power'] = strip_unit_and_beautify(dom_info_dict['rx1power'], POWER_UNIT) @@ -221,6 +221,19 @@ def beautify_dom_info_dict(dom_info_dict): dom_info_dict['tx2power'] = strip_unit_and_beautify(dom_info_dict['tx2power'], POWER_UNIT) dom_info_dict['tx3power'] = strip_unit_and_beautify(dom_info_dict['tx3power'], POWER_UNIT) dom_info_dict['tx4power'] = strip_unit_and_beautify(dom_info_dict['tx4power'], POWER_UNIT) + if _wrapper_get_sfp_type(physical_port) == 'QSFP_DD': + dom_info_dict['rx5power'] = strip_unit_and_beautify(dom_info_dict['rx5power'], POWER_UNIT) + dom_info_dict['rx6power'] = strip_unit_and_beautify(dom_info_dict['rx6power'], POWER_UNIT) + dom_info_dict['rx7power'] = strip_unit_and_beautify(dom_info_dict['rx7power'], POWER_UNIT) + dom_info_dict['rx8power'] = strip_unit_and_beautify(dom_info_dict['rx8power'], POWER_UNIT) + dom_info_dict['tx5bias'] = strip_unit_and_beautify(dom_info_dict['tx5bias'], BIAS_UNIT) + dom_info_dict['tx6bias'] = strip_unit_and_beautify(dom_info_dict['tx6bias'], BIAS_UNIT) + dom_info_dict['tx7bias'] = strip_unit_and_beautify(dom_info_dict['tx7bias'], BIAS_UNIT) + dom_info_dict['tx8bias'] = strip_unit_and_beautify(dom_info_dict['tx8bias'], BIAS_UNIT) + dom_info_dict['tx5power'] = strip_unit_and_beautify(dom_info_dict['tx5power'], POWER_UNIT) + dom_info_dict['tx6power'] = strip_unit_and_beautify(dom_info_dict['tx6power'], POWER_UNIT) + dom_info_dict['tx7power'] = strip_unit_and_beautify(dom_info_dict['tx7power'], POWER_UNIT) + dom_info_dict['tx8power'] = strip_unit_and_beautify(dom_info_dict['tx8power'], POWER_UNIT) def beautify_dom_threshold_info_dict(dom_info_dict): @@ -399,7 +412,7 @@ def post_port_dom_info_to_db(logical_port_name, table, stop_event=threading.Even try: dom_info_dict = _wrapper_get_transceiver_dom_info(physical_port) if dom_info_dict is not None: - beautify_dom_info_dict(dom_info_dict) + beautify_dom_info_dict(dom_info_dict, physical_port) if _wrapper_get_sfp_type(physical_port) == 'QSFP_DD': fvs = swsscommon.FieldValuePairs( [('temperature', dom_info_dict['temperature']), From cc3803fef79707897fdaf269e65871c8d64b329f Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox <57339448+Junchao-Mellanox@users.noreply.github.com> Date: Tue, 11 May 2021 00:46:17 +0800 Subject: [PATCH 04/17] [thermalctld] Enable stopping thermal manager (#180) Make thermalctld exit as soon as possible. Depends on https://github.com/Azure/sonic-platform-common/pull/187 During config reload flow, supervisord sends SIGTERM to thermalctld. However, if thermalctld cannot exit in 10 seconds, supervisord sends SITKILL to thermalctld. In this case, sub process of thermalctld might still stay in memory as a zombie process. This PR is aimed to fix this. 
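The fix threads a shared stopping event from ThermalMonitor into the updater classes so their per-device loops can bail out as soon as the daemon is asked to exit. A minimal sketch of that cooperative-stop pattern follows; the class and the simulated work are illustrative, not thermalctld's actual code.

```python
# Cooperative-stop sketch: the owner creates one multiprocessing.Event, hands it
# to each updater, and every long-running loop checks it so a SIGTERM handler
# only has to set the event to make the worker return promptly.
import multiprocessing
import time


class Updater(object):
    def __init__(self, items, task_stopping_event):
        self.items = items
        self.task_stopping_event = task_stopping_event

    def update(self):
        for item in self.items:
            if self.task_stopping_event.is_set():
                return          # stop mid-pass instead of finishing all devices
            time.sleep(0.1)     # stand-in for a slow platform API call


if __name__ == '__main__':
    stop_event = multiprocessing.Event()
    updater = Updater(range(100), stop_event)
    stop_event.set()            # in the daemon this happens in signal_handler()
    updater.update()            # returns immediately because the event is set
```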
--- sonic-thermalctld/scripts/thermalctld | 28 +++++++++-- sonic-thermalctld/tests/test_thermalctld.py | 52 ++++++++++++--------- 2 files changed, 55 insertions(+), 25 deletions(-) diff --git a/sonic-thermalctld/scripts/thermalctld b/sonic-thermalctld/scripts/thermalctld index 5c1449984..b98cd5c4a 100644 --- a/sonic-thermalctld/scripts/thermalctld +++ b/sonic-thermalctld/scripts/thermalctld @@ -191,7 +191,7 @@ class FanUpdater(logger.Logger): FAN_INFO_TABLE_NAME = 'FAN_INFO' FAN_DRAWER_INFO_TABLE_NAME = 'FAN_DRAWER_INFO' - def __init__(self, chassis): + def __init__(self, chassis, task_stopping_event): """ Initializer for FanUpdater :param chassis: Object representing a platform chassis @@ -199,6 +199,7 @@ class FanUpdater(logger.Logger): super(FanUpdater, self).__init__(SYSLOG_IDENTIFIER) self.chassis = chassis + self.task_stopping_event = task_stopping_event self.fan_status_dict = {} state_db = daemon_base.db_connect("STATE_DB") self.table = swsscommon.Table(state_db, FanUpdater.FAN_INFO_TABLE_NAME) @@ -236,8 +237,12 @@ class FanUpdater(logger.Logger): FanStatus.reset_fan_counter() for drawer_index, drawer in enumerate(self.chassis.get_all_fan_drawers()): + if self.task_stopping_event.is_set(): + return self._refresh_fan_drawer_status(drawer, drawer_index) for fan_index, fan in enumerate(drawer.get_all_fans()): + if self.task_stopping_event.is_set(): + return try: self._refresh_fan_status(drawer, drawer_index, fan, fan_index) except Exception as e: @@ -245,6 +250,8 @@ class FanUpdater(logger.Logger): for psu_index, psu in enumerate(self.chassis.get_all_psus()): for fan_index, fan in enumerate(psu.get_all_fans()): + if self.task_stopping_event.is_set(): + return try: self._refresh_fan_status(psu, psu_index, fan, fan_index, True) except Exception as e: @@ -396,6 +403,8 @@ class FanUpdater(logger.Logger): def _update_led_color(self): for fan_name, fan_status in self.fan_status_dict.items(): + if self.task_stopping_event.is_set(): + return try: fvs = swsscommon.FieldValuePairs([ ('led_status', str(try_get(fan_status.fan.get_status_led))) @@ -408,6 +417,8 @@ class FanUpdater(logger.Logger): self.table.set(fan_name, fvs) for drawer in self.chassis.get_all_fan_drawers(): + if self.task_stopping_event.is_set(): + return drawer_name = try_get(drawer.get_name) if drawer_name == NOT_AVAILABLE: continue @@ -510,7 +521,7 @@ class TemperatureUpdater(logger.Logger): # Temperature information table name in database TEMPER_INFO_TABLE_NAME = 'TEMPERATURE_INFO' - def __init__(self, chassis): + def __init__(self, chassis, task_stopping_event): """ Initializer of TemperatureUpdater :param chassis: Object representing a platform chassis @@ -518,6 +529,7 @@ class TemperatureUpdater(logger.Logger): super(TemperatureUpdater, self).__init__(SYSLOG_IDENTIFIER) self.chassis = chassis + self.task_stopping_event = task_stopping_event self.temperature_status_dict = {} state_db = daemon_base.db_connect("STATE_DB") self.table = swsscommon.Table(state_db, TemperatureUpdater.TEMPER_INFO_TABLE_NAME) @@ -562,6 +574,8 @@ class TemperatureUpdater(logger.Logger): """ self.log_debug("Start temperature updating") for index, thermal in enumerate(self.chassis.get_all_thermals()): + if self.task_stopping_event.is_set(): + return try: self._refresh_temperature_status(CHASSIS_INFO_KEY, thermal, index) except Exception as e: @@ -570,6 +584,8 @@ class TemperatureUpdater(logger.Logger): for psu_index, psu in enumerate(self.chassis.get_all_psus()): parent_name = 'PSU {}'.format(psu_index + 1) for thermal_index, thermal in 
enumerate(psu.get_all_thermals()): + if self.task_stopping_event.is_set(): + return try: self._refresh_temperature_status(parent_name, thermal, thermal_index) except Exception as e: @@ -578,6 +594,8 @@ class TemperatureUpdater(logger.Logger): for sfp_index, sfp in enumerate(self.chassis.get_all_sfps()): parent_name = 'SFP {}'.format(sfp_index + 1) for thermal_index, thermal in enumerate(sfp.get_all_thermals()): + if self.task_stopping_event.is_set(): + return try: self._refresh_temperature_status(parent_name, thermal, thermal_index) except Exception as e: @@ -686,8 +704,8 @@ class ThermalMonitor(ProcessTaskBase): # Set minimum logging level to INFO self.logger.set_min_log_priority_info() - self.fan_updater = FanUpdater(chassis) - self.temperature_updater = TemperatureUpdater(chassis) + self.fan_updater = FanUpdater(chassis, self.task_stopping_event) + self.temperature_updater = TemperatureUpdater(chassis, self.task_stopping_event) def main(self): begin = time.time() @@ -789,6 +807,8 @@ class ThermalControlDaemon(daemon_base.DaemonBase): if sig in FATAL_SIGNALS: self.log_info("Caught signal '{}' - exiting...".format(SIGNALS_TO_NAMES_DICT[sig])) exit_code = 128 + sig # Make sure we exit with a non-zero code so that supervisor will try to restart us + self.thermal_monitor.task_stop() + self.thermal_manager.stop() self.stop_event.set() elif sig in NONFATAL_SIGNALS: self.log_info("Caught signal '{}' - ignoring...".format(SIGNALS_TO_NAMES_DICT[sig])) diff --git a/sonic-thermalctld/tests/test_thermalctld.py b/sonic-thermalctld/tests/test_thermalctld.py index d2f647384..7c272826b 100644 --- a/sonic-thermalctld/tests/test_thermalctld.py +++ b/sonic-thermalctld/tests/test_thermalctld.py @@ -1,5 +1,6 @@ import os import sys +import multiprocessing from imp import load_source # TODO: Replace with importlib once we no longer need to support Python 2 # TODO: Clean this up once we no longer need to support Python 2 @@ -148,7 +149,7 @@ class TestFanUpdater(object): Test cases to cover functionality in FanUpdater class """ def test_deinit(self): - fan_updater = thermalctld.FanUpdater(MockChassis()) + fan_updater = thermalctld.FanUpdater(MockChassis(), multiprocessing.Event()) fan_updater.fan_status_dict = {'key1': 'value1', 'key2': 'value2'} fan_updater.table._del = mock.MagicMock() @@ -161,7 +162,7 @@ def test_deinit(self): @mock.patch('thermalctld.update_entity_info', mock.MagicMock()) def test_refresh_fan_drawer_status_fan_drawer_get_name_not_impl(self): # Test case where fan_drawer.get_name is not implemented - fan_updater = thermalctld.FanUpdater(MockChassis()) + fan_updater = thermalctld.FanUpdater(MockChassis(), multiprocessing.Event()) mock_fan_drawer = mock.MagicMock() fan_updater._refresh_fan_drawer_status(mock_fan_drawer, 1) assert thermalctld.update_entity_info.call_count == 0 @@ -175,7 +176,7 @@ def test_update_fan_with_exception(self): fan.make_over_speed() chassis.get_all_fans().append(fan) - fan_updater = thermalctld.FanUpdater(chassis) + fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event()) fan_updater.update() assert fan.get_status_led() == MockFan.STATUS_LED_COLOR_RED assert fan_updater.log_warning.call_count == 1 @@ -192,7 +193,7 @@ def test_set_fan_led_exception(self): mock_fan = MockFan() mock_fan.set_status_led = mock.MagicMock(side_effect=NotImplementedError) - fan_updater = thermalctld.FanUpdater(MockChassis()) + fan_updater = thermalctld.FanUpdater(MockChassis(), multiprocessing.Event()) fan_updater._set_fan_led(mock_fan_drawer, mock_fan, 'Test Fan', fan_status) 
assert fan_updater.log_warning.call_count == 1 fan_updater.log_warning.assert_called_with('Failed to set status LED for fan Test Fan, set_status_led not implemented') @@ -200,7 +201,7 @@ def test_set_fan_led_exception(self): def test_fan_absent(self): chassis = MockChassis() chassis.make_absent_fan() - fan_updater = thermalctld.FanUpdater(chassis) + fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event()) fan_updater.update() fan_list = chassis.get_all_fans() assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED @@ -224,7 +225,7 @@ def test_fan_absent(self): def test_fan_faulty(self): chassis = MockChassis() chassis.make_faulty_fan() - fan_updater = thermalctld.FanUpdater(chassis) + fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event()) fan_updater.update() fan_list = chassis.get_all_fans() assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED @@ -248,7 +249,7 @@ def test_fan_faulty(self): def test_fan_under_speed(self): chassis = MockChassis() chassis.make_under_speed_fan() - fan_updater = thermalctld.FanUpdater(chassis) + fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event()) fan_updater.update() fan_list = chassis.get_all_fans() assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED @@ -264,7 +265,7 @@ def test_fan_under_speed(self): def test_fan_over_speed(self): chassis = MockChassis() chassis.make_over_speed_fan() - fan_updater = thermalctld.FanUpdater(chassis) + fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event()) fan_updater.update() fan_list = chassis.get_all_fans() assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED @@ -283,7 +284,7 @@ def test_update_psu_fans(self): mock_fan = MockFan() psu._fan_list.append(mock_fan) chassis._psu_list.append(psu) - fan_updater = thermalctld.FanUpdater(chassis) + fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event()) fan_updater.update() assert fan_updater.log_warning.call_count == 0 @@ -331,7 +332,7 @@ def test_insufficient_fan_number(): chassis = MockChassis() chassis.make_absent_fan() chassis.make_faulty_fan() - fan_updater = thermalctld.FanUpdater(chassis) + fan_updater = thermalctld.FanUpdater(chassis, multiprocessing.Event()) fan_updater.update() assert fan_updater.log_warning.call_count == 3 expected_calls = [ @@ -415,7 +416,7 @@ class TestTemperatureUpdater(object): """ def test_deinit(self): chassis = MockChassis() - temp_updater = thermalctld.TemperatureUpdater(chassis) + temp_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) temp_updater.temperature_status_dict = {'key1': 'value1', 'key2': 'value2'} temp_updater.table._del = mock.MagicMock() @@ -423,11 +424,12 @@ def test_deinit(self): assert temp_updater.table._del.call_count == 2 expected_calls = [mock.call('key1'), mock.call('key2')] temp_updater.table._del.assert_has_calls(expected_calls, any_order=True) + def test_over_temper(self): chassis = MockChassis() chassis.make_over_temper_thermal() - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) temperature_updater.update() thermal_list = chassis.get_all_thermals() assert temperature_updater.log_warning.call_count == 1 @@ -441,7 +443,7 @@ def test_over_temper(self): def test_under_temper(self): chassis = MockChassis() chassis.make_under_temper_thermal() - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = 
thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) temperature_updater.update() thermal_list = chassis.get_all_thermals() assert temperature_updater.log_warning.call_count == 1 @@ -458,7 +460,7 @@ def test_update_psu_thermals(self): mock_thermal = MockThermal() psu._thermal_list.append(mock_thermal) chassis._psu_list.append(psu) - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) temperature_updater.update() assert temperature_updater.log_warning.call_count == 0 @@ -478,7 +480,7 @@ def test_update_sfp_thermals(self): mock_thermal = MockThermal() sfp._thermal_list.append(mock_thermal) chassis._sfp_list.append(sfp) - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) temperature_updater.update() assert temperature_updater.log_warning.call_count == 0 @@ -499,7 +501,7 @@ def test_update_thermal_with_exception(self): thermal.make_over_temper() chassis.get_all_thermals().append(thermal) - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) temperature_updater.update() assert temperature_updater.log_warning.call_count == 2 @@ -524,17 +526,17 @@ def test_updater_thermal_check_modular_chassis(): chassis = MockChassis() assert chassis.is_modular_chassis() == False - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) assert temperature_updater.chassis_table == None chassis.set_modular_chassis(True) chassis.set_my_slot(-1) - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) assert temperature_updater.chassis_table == None my_slot = 1 chassis.set_my_slot(my_slot) - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) assert temperature_updater.chassis_table != None assert temperature_updater.chassis_table.table_name == '{}_{}'.format(TEMPER_INFO_TABLE_NAME, str(my_slot)) @@ -547,7 +549,7 @@ def test_updater_thermal_check_chassis_table(): chassis.set_modular_chassis(True) chassis.set_my_slot(1) - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) temperature_updater.update() assert temperature_updater.chassis_table.get_size() == chassis.get_num_thermals() @@ -566,7 +568,7 @@ def test_updater_thermal_check_min_max(): chassis.set_modular_chassis(True) chassis.set_my_slot(1) - temperature_updater = thermalctld.TemperatureUpdater(chassis) + temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event()) temperature_updater.update() slot_dict = temperature_updater.chassis_table.get(thermal.get_name()) @@ -580,12 +582,14 @@ def test_signal_handler(): daemon_thermalctld.stop_event.set = mock.MagicMock() daemon_thermalctld.log_info = mock.MagicMock() daemon_thermalctld.log_warning = mock.MagicMock() + daemon_thermalctld.thermal_manager.stop = mock.MagicMock() daemon_thermalctld.signal_handler(thermalctld.signal.SIGHUP, None) daemon_thermalctld.deinit() # Deinit becuase the test will hang if we assert assert daemon_thermalctld.log_info.call_count == 1 
daemon_thermalctld.log_info.assert_called_with("Caught signal 'SIGHUP' - ignoring...") assert daemon_thermalctld.log_warning.call_count == 0 assert daemon_thermalctld.stop_event.set.call_count == 0 + assert daemon_thermalctld.thermal_manager.stop.call_count == 0 assert thermalctld.exit_code == thermalctld.ERR_UNKNOWN # Test SIGINT @@ -593,6 +597,7 @@ def test_signal_handler(): daemon_thermalctld.stop_event.set = mock.MagicMock() daemon_thermalctld.log_info = mock.MagicMock() daemon_thermalctld.log_warning = mock.MagicMock() + daemon_thermalctld.thermal_manager.stop = mock.MagicMock() test_signal = thermalctld.signal.SIGINT daemon_thermalctld.signal_handler(test_signal, None) daemon_thermalctld.deinit() # Deinit becuase the test will hang if we assert @@ -600,6 +605,7 @@ def test_signal_handler(): daemon_thermalctld.log_info.assert_called_with("Caught signal 'SIGINT' - exiting...") assert daemon_thermalctld.log_warning.call_count == 0 assert daemon_thermalctld.stop_event.set.call_count == 1 + assert daemon_thermalctld.thermal_manager.stop.call_count == 1 assert thermalctld.exit_code == (128 + test_signal) # Test SIGTERM @@ -608,6 +614,7 @@ def test_signal_handler(): daemon_thermalctld.stop_event.set = mock.MagicMock() daemon_thermalctld.log_info = mock.MagicMock() daemon_thermalctld.log_warning = mock.MagicMock() + daemon_thermalctld.thermal_manager.stop = mock.MagicMock() test_signal = thermalctld.signal.SIGTERM daemon_thermalctld.signal_handler(test_signal, None) daemon_thermalctld.deinit() # Deinit becuase the test will hang if we assert @@ -615,6 +622,7 @@ def test_signal_handler(): daemon_thermalctld.log_info.assert_called_with("Caught signal 'SIGTERM' - exiting...") assert daemon_thermalctld.log_warning.call_count == 0 assert daemon_thermalctld.stop_event.set.call_count == 1 + assert daemon_thermalctld.thermal_manager.stop.call_count == 1 assert thermalctld.exit_code == (128 + test_signal) # Test an unhandled signal @@ -623,12 +631,14 @@ def test_signal_handler(): daemon_thermalctld.stop_event.set = mock.MagicMock() daemon_thermalctld.log_info = mock.MagicMock() daemon_thermalctld.log_warning = mock.MagicMock() + daemon_thermalctld.thermal_manager.stop = mock.MagicMock() daemon_thermalctld.signal_handler(thermalctld.signal.SIGUSR1, None) daemon_thermalctld.deinit() # Deinit becuase the test will hang if we assert assert daemon_thermalctld.log_warning.call_count == 1 daemon_thermalctld.log_warning.assert_called_with("Caught unhandled signal 'SIGUSR1' - ignoring...") assert daemon_thermalctld.log_info.call_count == 0 assert daemon_thermalctld.stop_event.set.call_count == 0 + assert daemon_thermalctld.thermal_manager.stop.call_count == 0 assert thermalctld.exit_code == thermalctld.ERR_UNKNOWN From d0be6342d8001842107aacb836dd7c1661e7eee6 Mon Sep 17 00:00:00 2001 From: Tamer Ahmed Date: Mon, 10 May 2021 19:24:16 -0700 Subject: [PATCH 05/17] [muxcable] Remove Xcvrd Sleep (#174) When using redis select, xcvrd is sleeping for 100 msec which appears unnecessary after process all content of selectable objects. 
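For reference, a rough sketch of the event loop shape after the change: sel.select() already blocks for up to the given timeout, so the explicit sleep only added latency. The DB/table names and timeout below are illustrative, and a reachable redis instance with the corresponding database is assumed.

```python
# Sketch of a swsscommon select loop without an artificial sleep (assumed setup).
from swsscommon import swsscommon

SELECT_TIMEOUT = 1000  # milliseconds

appl_db = swsscommon.DBConnector("APPL_DB", 0)
status_tbl = swsscommon.SubscriberStateTable(appl_db, "HW_MUX_CABLE_TABLE")

sel = swsscommon.Select()
sel.addSelectable(status_tbl)

while True:
    # Blocks until a subscribed table changes or the timeout expires.
    (state, selectable_obj) = sel.select(SELECT_TIMEOUT)
    if state == swsscommon.Select.TIMEOUT:
        continue
    if state != swsscommon.Select.OBJECT:
        break
    while True:
        (key, op, fvp) = status_tbl.pop()
        if not key:
            break
        # handle the change event for 'key' here
```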
signed-off-by: Tamer Ahmed --- sonic-xcvrd/xcvrd/xcvrd_utilities/y_cable_helper.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sonic-xcvrd/xcvrd/xcvrd_utilities/y_cable_helper.py b/sonic-xcvrd/xcvrd/xcvrd_utilities/y_cable_helper.py index aba08a769..3d0e2294a 100644 --- a/sonic-xcvrd/xcvrd/xcvrd_utilities/y_cable_helper.py +++ b/sonic-xcvrd/xcvrd/xcvrd_utilities/y_cable_helper.py @@ -4,7 +4,6 @@ """ import threading -import time from sonic_py_common import daemon_base, logger from sonic_py_common import multi_asic @@ -1038,11 +1037,6 @@ def task_worker(self): # Use timeout to prevent ignoring the signals we want to handle # in signal_handler() (e.g. SIGTERM for graceful shutdown) - # A brief sleep appears necessary in this loop or any spawned - # update threads will get stuck. Appears to be due to the sel.select() call. - # TODO: Eliminate the need for this sleep. - time.sleep(0.1) - (state, selectableObj) = sel.select(SELECT_TIMEOUT) if state == swsscommon.Select.TIMEOUT: From 807b304d66a4e88e8091f430665a57b7a069020c Mon Sep 17 00:00:00 2001 From: Alexander Allen Date: Wed, 12 May 2021 15:51:42 -0400 Subject: [PATCH 06/17] [psud] Add PSU Hardware Revision to Redis STATE_DB (#179) Added "hardware revision" field to list of platform fields to sync to STATE_DB for the PSU. Also updated relevant unit tests. Now that hardware revision exists as a platform 2.0 field for all devices, it is appropriate to synchronize this field to STATE_DB for PSUs as is done with all other fields. This will allow this field to be exposed to CLI tools through psushow in the future which reads state from STATE_DB. --- sonic-psud/scripts/psud | 2 ++ sonic-psud/tests/mock_platform.py | 5 +++++ sonic-psud/tests/test_DaemonPsud.py | 3 ++- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/sonic-psud/scripts/psud b/sonic-psud/scripts/psud index 8a41fc054..6be6f2934 100644 --- a/sonic-psud/scripts/psud +++ b/sonic-psud/scripts/psud @@ -48,6 +48,7 @@ PSU_INFO_KEY_TEMPLATE = 'PSU {}' PSU_INFO_PRESENCE_FIELD = 'presence' PSU_INFO_MODEL_FIELD = 'model' PSU_INFO_SERIAL_FIELD = 'serial' +PSU_INFO_REV_FIELD = 'revision' PSU_INFO_STATUS_FIELD = 'status' PSU_INFO_TEMP_FIELD = 'temp' PSU_INFO_TEMP_TH_FIELD = 'temp_threshold' @@ -510,6 +511,7 @@ class DaemonPsud(daemon_base.DaemonBase): fvs = swsscommon.FieldValuePairs( [(PSU_INFO_MODEL_FIELD, str(try_get(psu.get_model, NOT_AVAILABLE))), (PSU_INFO_SERIAL_FIELD, str(try_get(psu.get_serial, NOT_AVAILABLE))), + (PSU_INFO_REV_FIELD, str(try_get(psu.get_revision, NOT_AVAILABLE))), (PSU_INFO_TEMP_FIELD, str(temperature)), (PSU_INFO_TEMP_TH_FIELD, str(temperature_threshold)), (PSU_INFO_VOLTAGE_FIELD, str(voltage)), diff --git a/sonic-psud/tests/mock_platform.py b/sonic-psud/tests/mock_platform.py index 6a0bb3517..8d3b13c52 100644 --- a/sonic-psud/tests/mock_platform.py +++ b/sonic-psud/tests/mock_platform.py @@ -273,6 +273,7 @@ def __init__(self, presence=True, model='Module Model', serial='Module Serial', + revision='Module Revision', status=True, voltage=12.0, current=8.0, @@ -287,6 +288,7 @@ def __init__(self, self._presence = presence self._model = model self._serial = serial + self._revision = revision self._status = status self._position_in_parent = position_in_parent self._replaceable = replaceable @@ -365,6 +367,9 @@ def get_model(self): def get_serial(self): return self._serial + def get_revision(self): + return self._revision + def get_status(self): return self._status diff --git a/sonic-psud/tests/test_DaemonPsud.py b/sonic-psud/tests/test_DaemonPsud.py 
index 116886657..0234ec79f 100644 --- a/sonic-psud/tests/test_DaemonPsud.py +++ b/sonic-psud/tests/test_DaemonPsud.py @@ -148,13 +148,14 @@ def test_update_single_psu_data(self): psud._wrapper_get_psu_presence.return_value = True psud._wrapper_get_psu_status.return_value = True - psu1 = MockPsu('PSU 1', 0, True, 'Fake Model', '12345678') + psu1 = MockPsu('PSU 1', 0, True, 'Fake Model', '12345678', '1234') psud.platform_chassis = MockChassis() psud.platform_chassis._psu_list.append(psu1) expected_fvp = psud.swsscommon.FieldValuePairs( [(psud.PSU_INFO_MODEL_FIELD, 'Fake Model'), (psud.PSU_INFO_SERIAL_FIELD, '12345678'), + (psud.PSU_INFO_REV_FIELD, '1234'), (psud.PSU_INFO_TEMP_FIELD, '30.0'), (psud.PSU_INFO_TEMP_TH_FIELD, '50.0'), (psud.PSU_INFO_VOLTAGE_FIELD, '12.0'), From e60804cd823090291ad1032e2ef3e9edb11d5a65 Mon Sep 17 00:00:00 2001 From: vdahiya12 <67608553+vdahiya12@users.noreply.github.com> Date: Mon, 17 May 2021 11:20:59 -0700 Subject: [PATCH 07/17] [xcvrd] add support for logging mux_metrics events into state DB (#185) * [xcvrd] add support for logging mux_metrics events into state DB Description This PR adds support for logging events for change requests received by xcvrd from swss into state DB. a typical log would look like this: 1) "xcvrd_switch_standby_start" 2) "2021-05-13 10:01:15.690835" 3) "xcvrd_switch_standby_end" 4) "2021-05-13 10:01:15.696051" where the key-value pairs signify the type of event requested out of active/standby/unknown and the timestamp associated with the event. Motivation and Context This is required for xcvrd to log the events which it receives in form of requests from swss or any other module into the DB. The timestamp will be useful for debugging the timeline of an event processing from a perspective of other modules as well as xcvrd itself. How Has This Been Tested? ran the changes on starlab testbed, by changing the file in the container. 
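In short, the handler captures a UTC timestamp when the toggle request is popped from the APP DB table and another once the mux direction has been switched, then writes both into the per-port MUX_METRICS entry in STATE_DB. The diff below adds essentially the following, shown here as a standalone helper for readability (the table handle and the toggle step are placeholders).

```python
# Illustrative helper mirroring the metrics write added by this patch.
import datetime

from swsscommon import swsscommon


def record_switch_metrics(mux_metrics_tbl, port, new_status):
    time_start = datetime.datetime.utcnow().strftime("%Y-%b-%d %H:%M:%S.%f")
    # ... toggle the mux direction to new_status for 'port' here ...
    time_end = datetime.datetime.utcnow().strftime("%Y-%b-%d %H:%M:%S.%f")
    fvs_metrics = swsscommon.FieldValuePairs(
        [('xcvrd_switch_{}_start'.format(new_status), time_start),
         ('xcvrd_switch_{}_end'.format(new_status), time_end)])
    mux_metrics_tbl.set(port, fvs_metrics)
```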
Signed-off-by: vaibhav-dahiya --- sonic-xcvrd/xcvrd/xcvrd_utilities/y_cable_helper.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sonic-xcvrd/xcvrd/xcvrd_utilities/y_cable_helper.py b/sonic-xcvrd/xcvrd/xcvrd_utilities/y_cable_helper.py index 3d0e2294a..ac2fc4e0d 100644 --- a/sonic-xcvrd/xcvrd/xcvrd_utilities/y_cable_helper.py +++ b/sonic-xcvrd/xcvrd/xcvrd_utilities/y_cable_helper.py @@ -3,6 +3,7 @@ helper utlities configuring y_cable for xcvrd daemon """ +import datetime import threading from sonic_py_common import daemon_base, logger @@ -1010,6 +1011,7 @@ def task_worker(self): appl_db, state_db, status_tbl, y_cable_tbl = {}, {}, {}, {} y_cable_tbl_keys = {} mux_cable_command_tbl, y_cable_command_tbl = {}, {} + mux_metrics_tbl = {} sel = swsscommon.Select() @@ -1028,6 +1030,8 @@ def task_worker(self): state_db[asic_id] = daemon_base.db_connect("STATE_DB", namespace) y_cable_tbl[asic_id] = swsscommon.Table( state_db[asic_id], swsscommon.STATE_HW_MUX_CABLE_TABLE_NAME) + mux_metrics_tbl[asic_id] = swsscommon.Table( + state_db[asic_id], swsscommon.STATE_MUX_METRICS_TABLE_NAME) y_cable_tbl_keys[asic_id] = y_cable_tbl[asic_id].getKeys() sel.addSelectable(status_tbl[asic_id]) sel.addSelectable(mux_cable_command_tbl[asic_id]) @@ -1058,6 +1062,10 @@ def task_worker(self): (port, op, fvp) = status_tbl[asic_index].pop() if not port: break + + # entering this section signifies a start for xcvrd state + # change request from swss so initiate recording in mux_metrics table + time_start = datetime.datetime.utcnow().strftime("%Y-%b-%d %H:%M:%S.%f") if fvp: # This check might be redundant, to check, the presence of this Port in keys # in logical_port_list but keep for now for coherency @@ -1091,6 +1099,10 @@ def task_worker(self): y_cable_tbl[asic_index].set(port, fvs_updated) helper_logger.log_info("Got a change event for toggle the mux-direction active side for port {} state from {} to {}".format( port, old_status, new_status)) + time_end = datetime.datetime.utcnow().strftime("%Y-%b-%d %H:%M:%S.%f") + fvs_metrics = swsscommon.FieldValuePairs([('xcvrd_switch_{}_start'.format(new_status), str(time_start)), + ('xcvrd_switch_{}_end'.format(new_status), str(time_end))]) + mux_metrics_tbl[asic_index].set(port, fvs_metrics) else: helper_logger.log_info("Got a change event on port {} of table {} that does not contain state".format( port, swsscommon.APP_HW_MUX_CABLE_TABLE_NAME)) From 1adf47b20c86b37a60c64a25ecacdc8eb49c3903 Mon Sep 17 00:00:00 2001 From: Alexander Allen Date: Tue, 18 May 2021 12:18:51 -0400 Subject: [PATCH 08/17] [chassisd] Add script to initialize chassis info in STATE_DB (#183) #### Description I added a new script `chassis_db_init` that uploads chassis hardware information such as serial number, model number and hardware revision to the STATE_DB under the CHASSIS_INFO table. #### Motivation and Context I made this change in order to expose the chassis hardware information to SONiC user space and allow CLI utilities to access it in order to expose it to the user. 
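Since the point of the script is to make this data consumable from user space, here is a hypothetical reader-side snippet showing how a CLI utility might pull the populated entry back out of STATE_DB. Only the table name and key come from the script below; the helper itself is illustrative and not part of this PR.

```python
# Illustrative consumer of the CHASSIS_INFO entry written by chassis_db_init.
from swsscommon import swsscommon


def read_chassis_info():
    state_db = swsscommon.DBConnector("STATE_DB", 0)
    chassis_table = swsscommon.Table(state_db, "CHASSIS_INFO")
    (status, fvp) = chassis_table.get("chassis 1")
    if not status:
        return {}
    return dict(fvp)  # e.g. {'serial': ..., 'model': ..., 'revision': ...}
```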
--- sonic-chassisd/scripts/chassis_db_init | 105 +++++++++++++++++++ sonic-chassisd/setup.py | 1 + sonic-chassisd/tests/mock_platform.py | 9 ++ sonic-chassisd/tests/test_chassis_db_init.py | 38 +++++++ 4 files changed, 153 insertions(+) create mode 100644 sonic-chassisd/scripts/chassis_db_init create mode 100644 sonic-chassisd/tests/test_chassis_db_init.py diff --git a/sonic-chassisd/scripts/chassis_db_init b/sonic-chassisd/scripts/chassis_db_init new file mode 100644 index 000000000..b1aafb881 --- /dev/null +++ b/sonic-chassisd/scripts/chassis_db_init @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 + +""" + chassis_db_init + Chassis information update tool for SONiC + This tool runs one time at the launch of the platform monitor in order to populate STATE_DB with chassis information such as model, serial number, and revision. +""" + +try: + import os + import sys + + from sonic_py_common import daemon_base, logger + + # If unit testing is occurring, mock swsscommon and module_base + if os.getenv("CHASSIS_DB_INIT_UNIT_TESTING") == "1": + from tests import mock_swsscommon as swsscommon + else: + from swsscommon import swsscommon +except ImportError as e: + raise ImportError(str(e) + " - required module not found") + +# +# Constants +# + +SYSLOG_IDENTIFIER = "chassis_db_init" + +CHASSIS_INFO_TABLE = 'CHASSIS_INFO' +CHASSIS_INFO_KEY_TEMPLATE = 'chassis {}' +CHASSIS_INFO_CARD_NUM_FIELD = 'module_num' +CHASSIS_INFO_SERIAL_FIELD = 'serial' +CHASSIS_INFO_MODEL_FIELD = 'model' +CHASSIS_INFO_REV_FIELD = 'revision' + +CHASSIS_LOAD_ERROR = 1 + +NOT_AVAILABLE = 'N/A' + +# +# Helper functions ============================================================= +# + +# try get information from platform API and return a default value if caught NotImplementedError + + +def try_get(callback, *args, **kwargs): + """ + Handy function to invoke the callback and catch NotImplementedError + :param callback: Callback to be invoked + :param args: Arguments to be passed to callback + :param kwargs: Default return value if exception occur + :return: Default return value if exception occur else return value of the callback + """ + default = kwargs.get('default', NOT_AVAILABLE) + try: + ret = callback(*args) + if ret is None: + ret = default + except NotImplementedError: + ret = default + + return ret + +# +# Functions +# + +def provision_db(platform_chassis, log): + # Init state db connection + state_db = daemon_base.db_connect("STATE_DB") + chassis_table = swsscommon.Table(state_db, CHASSIS_INFO_TABLE) + + # Populate DB with chassis hardware info + fvs = swsscommon.FieldValuePairs([ + (CHASSIS_INFO_SERIAL_FIELD, try_get(platform_chassis.get_serial)), + (CHASSIS_INFO_MODEL_FIELD, try_get(platform_chassis.get_model)), + (CHASSIS_INFO_REV_FIELD, try_get(platform_chassis.get_revision)) + ]) + chassis_table.set(CHASSIS_INFO_KEY_TEMPLATE.format(1), fvs) + log.log_info("STATE_DB provisioned with chassis info.") + + return chassis_table + + +# +# Main +# + +def main(): + log = logger.Logger(SYSLOG_IDENTIFIER) + log.log_info("Provisioning Database with Chassis Info...") + + # Load platform api class + try: + import sonic_platform.platform + platform_chassis = sonic_platform.platform.Platform().get_chassis() + except Exception as e: + log.log_error("Failed to load chassis due to {}".format(repr(e))) + sys.exit(CHASSIS_LOAD_ERROR) + + provision_db(platform_chassis, log) + +if __name__ == '__main__': + main() diff --git a/sonic-chassisd/setup.py b/sonic-chassisd/setup.py index 271c7a794..6fcfb51f3 100644 --- a/sonic-chassisd/setup.py +++ 
b/sonic-chassisd/setup.py @@ -15,6 +15,7 @@ ], scripts=[ 'scripts/chassisd', + 'scripts/chassis_db_init' ], setup_requires=[ 'pytest-runner', diff --git a/sonic-chassisd/tests/mock_platform.py b/sonic-chassisd/tests/mock_platform.py index c878fafd6..f13356c52 100644 --- a/sonic-chassisd/tests/mock_platform.py +++ b/sonic-chassisd/tests/mock_platform.py @@ -92,3 +92,12 @@ def get_module_index(self, module_name): def init_midplane_switch(self): return True + + def get_serial(self): + return "Serial No" + + def get_model(self): + return "Model A" + + def get_revision(self): + return "Rev C" diff --git a/sonic-chassisd/tests/test_chassis_db_init.py b/sonic-chassisd/tests/test_chassis_db_init.py new file mode 100644 index 000000000..e9a9560ba --- /dev/null +++ b/sonic-chassisd/tests/test_chassis_db_init.py @@ -0,0 +1,38 @@ +import os +import sys +from imp import load_source + +from mock import Mock, MagicMock, patch +from sonic_py_common import daemon_base + +from .mock_platform import MockChassis, MockModule +from .mock_module_base import ModuleBase + +SYSLOG_IDENTIFIER = 'chassis_db_init_test' +NOT_AVAILABLE = 'N/A' + +daemon_base.db_connect = MagicMock() + +test_path = os.path.dirname(os.path.abspath(__file__)) +modules_path = os.path.dirname(test_path) +scripts_path = os.path.join(modules_path, "scripts") +sys.path.insert(0, modules_path) + +os.environ["CHASSIS_DB_INIT_UNIT_TESTING"] = "1" +load_source('chassis_db_init', scripts_path + '/chassis_db_init') +from chassis_db_init import * + + +def test_provision_db(): + chassis = MockChassis() + log = MagicMock() + serial = "Serial No" + model = "Model A" + revision = "Rev C" + + chassis_table = provision_db(chassis, log) + + fvs = chassis_table.get(CHASSIS_INFO_KEY_TEMPLATE.format(1)) + assert serial == fvs[CHASSIS_INFO_SERIAL_FIELD] + assert model == fvs[CHASSIS_INFO_MODEL_FIELD] + assert revision == fvs[CHASSIS_INFO_REV_FIELD] From 9ba52a2ec66a2a2b5e24d6553f0f3af798b0018c Mon Sep 17 00:00:00 2001 From: Qi Luo Date: Fri, 21 May 2021 15:24:55 -0700 Subject: [PATCH 09/17] Mock path early so it will applied to sonic_py_common, mock more swsscommon classes (#187) #### Description Existing test will mix true swsscommon package with the mocked one in sonic_py_common #### Motivation and Context This is blocking https://github.com/Azure/sonic-buildimage/pull/7655 #### How Has This Been Tested? 
Unit test --- .../tests/mocked_libs/swsscommon/swsscommon.py | 2 ++ sonic-thermalctld/tests/test_thermalctld.py | 18 ++++++++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/sonic-thermalctld/tests/mocked_libs/swsscommon/swsscommon.py b/sonic-thermalctld/tests/mocked_libs/swsscommon/swsscommon.py index 6947a8601..13c49dec1 100644 --- a/sonic-thermalctld/tests/mocked_libs/swsscommon/swsscommon.py +++ b/sonic-thermalctld/tests/mocked_libs/swsscommon/swsscommon.py @@ -2,6 +2,8 @@ Mock implementation of swsscommon package for unit testing ''' +from swsssdk import ConfigDBConnector, SonicDBConfig, SonicV2Connector + STATE_DB = '' diff --git a/sonic-thermalctld/tests/test_thermalctld.py b/sonic-thermalctld/tests/test_thermalctld.py index 7c272826b..063e11c22 100644 --- a/sonic-thermalctld/tests/test_thermalctld.py +++ b/sonic-thermalctld/tests/test_thermalctld.py @@ -10,18 +10,24 @@ import mock import pytest -from sonic_py_common import daemon_base - -from .mock_platform import MockChassis, MockFan, MockPsu, MockSfp, MockThermal - -daemon_base.db_connect = mock.MagicMock() - tests_path = os.path.dirname(os.path.abspath(__file__)) # Add mocked_libs path so that the file under test can load mocked modules from there mocked_libs_path = os.path.join(tests_path, 'mocked_libs') sys.path.insert(0, mocked_libs_path) + +import swsscommon +# Check we are using the mocked package +assert len(swsscommon.__path__) == 1 +assert(os.path.samefile(swsscommon.__path__[0], os.path.join(mocked_libs_path, 'swsscommon'))) + +from sonic_py_common import daemon_base + +from .mock_platform import MockChassis, MockFan, MockPsu, MockSfp, MockThermal + +daemon_base.db_connect = mock.MagicMock() + # Add path to the file under test so that we can load it modules_path = os.path.dirname(tests_path) scripts_path = os.path.join(modules_path, 'scripts') From 9297a294b5e6cde0cbfa05d51b185070476dd6c9 Mon Sep 17 00:00:00 2001 From: Qi Luo Date: Mon, 24 May 2021 08:42:49 -0700 Subject: [PATCH 10/17] Mock path early so it will applied to sonic_py_common, mock platform_chassis (#188) #### Description Following https://github.com/Azure/sonic-platform-daemons/pull/187 to fix the same issue with other packages #### Motivation and Context This is blocking Azure/sonic-buildimage#7655 #### How Has This Been Tested? 
Unit test --- sonic-psud/tests/mocked_libs/swsscommon/swsscommon.py | 2 ++ sonic-psud/tests/test_DaemonPsud.py | 5 +++-- sonic-psud/tests/test_psud.py | 3 ++- sonic-syseepromd/tests/mocked_libs/swsscommon/swsscommon.py | 5 +++++ sonic-syseepromd/tests/test_syseepromd.py | 6 +++--- 5 files changed, 15 insertions(+), 6 deletions(-) diff --git a/sonic-psud/tests/mocked_libs/swsscommon/swsscommon.py b/sonic-psud/tests/mocked_libs/swsscommon/swsscommon.py index 6947a8601..13c49dec1 100644 --- a/sonic-psud/tests/mocked_libs/swsscommon/swsscommon.py +++ b/sonic-psud/tests/mocked_libs/swsscommon/swsscommon.py @@ -2,6 +2,8 @@ Mock implementation of swsscommon package for unit testing ''' +from swsssdk import ConfigDBConnector, SonicDBConfig, SonicV2Connector + STATE_DB = '' diff --git a/sonic-psud/tests/test_DaemonPsud.py b/sonic-psud/tests/test_DaemonPsud.py index 0234ec79f..06033f36e 100644 --- a/sonic-psud/tests/test_DaemonPsud.py +++ b/sonic-psud/tests/test_DaemonPsud.py @@ -10,14 +10,12 @@ from unittest import mock else: import mock -from sonic_py_common import daemon_base from .mock_platform import MockChassis, MockFan, MockPsu SYSLOG_IDENTIFIER = 'psud_test' NOT_AVAILABLE = 'N/A' -daemon_base.db_connect = mock.MagicMock() tests_path = os.path.dirname(os.path.abspath(__file__)) @@ -25,6 +23,9 @@ mocked_libs_path = os.path.join(tests_path, "mocked_libs") sys.path.insert(0, mocked_libs_path) +from sonic_py_common import daemon_base +daemon_base.db_connect = mock.MagicMock() + # Add path to the file under test so that we can load it modules_path = os.path.dirname(tests_path) scripts_path = os.path.join(modules_path, "scripts") diff --git a/sonic-psud/tests/test_psud.py b/sonic-psud/tests/test_psud.py index a66a8c29a..405ec30c6 100644 --- a/sonic-psud/tests/test_psud.py +++ b/sonic-psud/tests/test_psud.py @@ -11,7 +11,7 @@ import mock from sonic_py_common import daemon_base -from .mock_platform import MockPsu +from .mock_platform import MockPsu, MockChassis tests_path = os.path.dirname(os.path.abspath(__file__)) @@ -179,6 +179,7 @@ def test_log_on_status_changed(): mock_logger.log_warning.assert_called_with(abnormal_log) +@mock.patch('psud.platform_chassis', mock.MagicMock()) @mock.patch('psud.DaemonPsud.run') def test_main(mock_run): mock_run.return_value = False diff --git a/sonic-syseepromd/tests/mocked_libs/swsscommon/swsscommon.py b/sonic-syseepromd/tests/mocked_libs/swsscommon/swsscommon.py index 8a0a87692..13c49dec1 100644 --- a/sonic-syseepromd/tests/mocked_libs/swsscommon/swsscommon.py +++ b/sonic-syseepromd/tests/mocked_libs/swsscommon/swsscommon.py @@ -2,6 +2,8 @@ Mock implementation of swsscommon package for unit testing ''' +from swsssdk import ConfigDBConnector, SonicDBConfig, SonicV2Connector + STATE_DB = '' @@ -23,6 +25,9 @@ def get(self, key): return self.mock_dict[key] return None + def get_size(self): + return (len(self.mock_dict)) + class FieldValuePairs: fv_dict = {} diff --git a/sonic-syseepromd/tests/test_syseepromd.py b/sonic-syseepromd/tests/test_syseepromd.py index e25b94ce3..4ac106b8a 100644 --- a/sonic-syseepromd/tests/test_syseepromd.py +++ b/sonic-syseepromd/tests/test_syseepromd.py @@ -9,19 +9,19 @@ from unittest import mock else: import mock -from sonic_py_common import daemon_base SYSLOG_IDENTIFIER = 'syseepromd_test' NOT_AVAILABLE = 'N/A' -daemon_base.db_connect = mock.MagicMock() - tests_path = os.path.dirname(os.path.abspath(__file__)) # Add mocked_libs path so that the file under test can load mocked modules from there mocked_libs_path = 
os.path.join(tests_path, 'mocked_libs') sys.path.insert(0, mocked_libs_path) +from sonic_py_common import daemon_base +daemon_base.db_connect = mock.MagicMock() + # Add path to the file under test so that we can load it modules_path = os.path.dirname(tests_path) scripts_path = os.path.join(modules_path, 'scripts') From bf60a27847df44612daa7776fe61c5c64722317b Mon Sep 17 00:00:00 2001 From: Qi Luo Date: Wed, 2 Jun 2021 16:34:28 -0700 Subject: [PATCH 11/17] Replace swsssdk.SonicV2Connector with swsscommon implementation (#191) swsssdk will be deprecated, and the python version of redis accessing classes will be replace by the same name classes in swsscommon, which are SWIG wrapped C++ code. --- sonic-pcied/scripts/pcied | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sonic-pcied/scripts/pcied b/sonic-pcied/scripts/pcied index d69ddcb92..0f2a5208c 100644 --- a/sonic-pcied/scripts/pcied +++ b/sonic-pcied/scripts/pcied @@ -11,7 +11,6 @@ try: import sys import threading - import swsssdk from sonic_py_common import daemon_base, device_info from swsscommon import swsscommon except ImportError as e: @@ -49,7 +48,7 @@ class DaemonPcied(daemon_base.DaemonBase): self.timeout = PCIED_MAIN_THREAD_SLEEP_SECS self.stop_event = threading.Event() - self.state_db = swsssdk.SonicV2Connector(host=REDIS_HOSTIP) + self.state_db = swsscommon.SonicV2Connector(host=REDIS_HOSTIP) self.state_db.connect("STATE_DB") state_db = daemon_base.db_connect("STATE_DB") self.device_table = swsscommon.Table(state_db, PCIE_DEVICE_TABLE_NAME) From a6903c029cb2a3f25e0682424c08811d05bc6bb2 Mon Sep 17 00:00:00 2001 From: Joe LeVeque Date: Wed, 16 Jun 2021 09:05:26 -0700 Subject: [PATCH 12/17] [CI] sonic-config-engine now depends on SONiC YANG packages (#194) The Python 3 version of sonic-config-engine now depends on sonic-yang-mgmt and sonic-yang-models, so we now need to install them as part of continuous integration to get the CI working again --- azure-pipelines.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index cd7d1f3cd..2439aff16 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -85,6 +85,8 @@ jobs: sudo pip2 install sonic_platform_common-1.0-py2-none-any.whl sudo pip3 install swsssdk-2.0.1-py3-none-any.whl sudo pip3 install sonic_py_common-1.0-py3-none-any.whl + sudo pip3 install sonic_yang_mgmt-1.0-py3-none-any.whl + sudo pip3 install sonic_yang_models-1.0-py3-none-any.whl sudo pip3 install sonic_config_engine-1.0-py3-none-any.whl sudo pip3 install sonic_platform_common-1.0-py3-none-any.whl workingDirectory: $(Pipeline.Workspace)/target/python-wheels/ From eb8a22337b1522ec10e917561f40e0835ecd5c1f Mon Sep 17 00:00:00 2001 From: Alexander Allen Date: Wed, 16 Jun 2021 12:34:33 -0400 Subject: [PATCH 13/17] [xcvrd] Force cleanup of chassis global variable on deinit (#193) - Description I added a del directive for the global platform_chassis object in xcvrd on deinit to ensure that the pointer is properly cleaned up such that the __del__() method is successfully called on the chassis object which closes its connection to SAI. - Motivation and Context On Mellanox platforms on 202012 we are seeing that in some cases xcvrd is maintaining the socket to the API past its deconstruction causing an error from SAI. This was traced by reproducing the issue and identifying that the pmon container was causing this issue where xcvrd is the only daemon within pmon that opens a SDK socket on Mellanox. Error in log... 
syncd#SDK: [SX_API_INTERNAL.ERR] Failed command read at communication channel: Connection reset by peer - How Has This Been Tested? Change was uploaded to xcvrd on running pmon container on a switch running the latest 202012 build. xcvrd was then restarted with supervisord and we verified that we could no longer reproduce the bug. --- sonic-xcvrd/xcvrd/xcvrd.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sonic-xcvrd/xcvrd/xcvrd.py b/sonic-xcvrd/xcvrd/xcvrd.py index 873a1fa52..e17237f45 100644 --- a/sonic-xcvrd/xcvrd/xcvrd.py +++ b/sonic-xcvrd/xcvrd/xcvrd.py @@ -1353,6 +1353,8 @@ def deinit(self): if self.y_cable_presence[0] is True: y_cable_helper.delete_ports_status_for_y_cable() + del globals()['platform_chassis'] + # Run daemon def run(self): From 2fc05b21536ac95ac08cbb36822232239923e08a Mon Sep 17 00:00:00 2001 From: Sujin Kang Date: Thu, 17 Jun 2021 14:03:37 -0700 Subject: [PATCH 14/17] Refactor Pcied and add unittest (#189) Description Refactor the pcied and add the unit test Motivation and Context Added unit test to increase the pmon unit test coverage. How Has This Been Tested? Build with unit test enabled and run manually on a dut to verify the pcied. --- sonic-pcied/pytest.ini | 2 + sonic-pcied/scripts/pcied | 265 ++++++++++-------- sonic-pcied/setup.cfg | 2 + sonic-pcied/setup.py | 12 + sonic-pcied/tests/__init__.py | 0 sonic-pcied/tests/mock_platform.py | 48 ++++ .../tests/mocked_libs/swsscommon/__init__.py | 5 + .../mocked_libs/swsscommon/swsscommon.py | 54 ++++ sonic-pcied/tests/test_DaemonPcied.py | 226 +++++++++++++++ sonic-pcied/tests/test_pcied.py | 43 +++ 10 files changed, 542 insertions(+), 115 deletions(-) create mode 100644 sonic-pcied/pytest.ini create mode 100644 sonic-pcied/setup.cfg create mode 100644 sonic-pcied/tests/__init__.py create mode 100644 sonic-pcied/tests/mock_platform.py create mode 100644 sonic-pcied/tests/mocked_libs/swsscommon/__init__.py create mode 100644 sonic-pcied/tests/mocked_libs/swsscommon/swsscommon.py create mode 100644 sonic-pcied/tests/test_DaemonPcied.py create mode 100644 sonic-pcied/tests/test_pcied.py diff --git a/sonic-pcied/pytest.ini b/sonic-pcied/pytest.ini new file mode 100644 index 000000000..d90ee9ed9 --- /dev/null +++ b/sonic-pcied/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +addopts = --cov=scripts --cov-report html --cov-report term --cov-report xml --junitxml=test-results.xml -vv diff --git a/sonic-pcied/scripts/pcied b/sonic-pcied/scripts/pcied index 0f2a5208c..7265063ff 100644 --- a/sonic-pcied/scripts/pcied +++ b/sonic-pcied/scripts/pcied @@ -5,30 +5,62 @@ PCIe device monitoring daemon for SONiC """ -try: - import os - import signal - import sys - import threading +import os +import signal +import sys +import threading - from sonic_py_common import daemon_base, device_info - from swsscommon import swsscommon -except ImportError as e: - raise ImportError(str(e) + " - required module not found") +from sonic_py_common import daemon_base, device_info +from swsscommon import swsscommon # # Constants ==================================================================== # + +# TODO: Once we no longer support Python 2, we can eliminate this and get the +# name using the 'name' field (e.g., `signal.SIGINT.name`) starting with Python 3.5 +SIGNALS_TO_NAMES_DICT = dict((getattr(signal, n), n) + for n in dir(signal) if n.startswith('SIG') and '_' not in n) + SYSLOG_IDENTIFIER = "pcied" PCIE_RESULT_REGEX = "PCIe Device Checking All Test" -PCIE_TABLE_NAME = "PCIE_STATUS" PCIE_DEVICE_TABLE_NAME = "PCIE_DEVICE" - -PCIE_CONF_FILE = 
'pcie.yaml' +PCIE_STATUS_TABLE_NAME = "PCIE_DEVICES" PCIED_MAIN_THREAD_SLEEP_SECS = 60 -REDIS_HOSTIP = "127.0.0.1" + +PCIEUTIL_CONF_FILE_ERROR = 1 +PCIEUTIL_LOAD_ERROR = 2 + +platform_pcieutil = None + +exit_code = 0 + +# wrapper functions to call the platform api +def load_platform_pcieutil(): + _platform_pcieutil = None + (platform_path, _) = device_info.get_paths_to_platform_and_hwsku_dirs() + try: + from sonic_platform.pcie import Pcie + _platform_pcieutil = Pcie(platform_path) + except ImportError as e: + self.log_error("Failed to load platform Pcie module. Error : {}".format(str(e)), True) + try: + from sonic_platform_base.sonic_pcie.pcie_common import PcieUtil + _platform_pcieutil = PcieUtil(platform_path) + except ImportError as e: + self.log_error("Failed to load default PcieUtil module. Error : {}".format(str(e)), True) + return _platform_pcieutil + +def read_id_file(device_name): + id = None + dev_id_path = '/sys/bus/pci/devices/0000:%s/device' % device_name + + if os.path.exists(dev_id_path): + with open(dev_id_path, 'r') as fd: + id = fd.read().strip() + return id # # Daemon ======================================================================= @@ -39,134 +71,129 @@ class DaemonPcied(daemon_base.DaemonBase): def __init__(self, log_identifier): super(DaemonPcied, self).__init__(log_identifier) - (platform_path, _) = device_info.get_paths_to_platform_and_hwsku_dirs() - pciefilePath = os.path.join(platform_path, PCIE_CONF_FILE) - if not os.path.exists(pciefilePath): - self.log_error("Platform pcie configuration file doesn't exist! Exiting ...") - sys.exit("Platform PCIe Configuration file doesn't exist!") - self.timeout = PCIED_MAIN_THREAD_SLEEP_SECS self.stop_event = threading.Event() - - self.state_db = swsscommon.SonicV2Connector(host=REDIS_HOSTIP) - self.state_db.connect("STATE_DB") - state_db = daemon_base.db_connect("STATE_DB") - self.device_table = swsscommon.Table(state_db, PCIE_DEVICE_TABLE_NAME) - - # Load AER-fields into STATEDB - def update_aer_to_statedb(self, device_name, aer_stats): + self.state_db = None + self.device_table = None + self.table = None + self.resultInfo = [] + self.device_name = None + self.aer_stats = {} + + global platform_pcieutil + + platform_pcieutil = load_platform_pcieutil() + if platform_pcieutil is None: + sys.exit(PCIEUTIL_LOAD_ERROR) + + # Connect to STATE_DB and create pcie device table + self.state_db = daemon_base.db_connect("STATE_DB") + self.device_table = swsscommon.Table(self.state_db, PCIE_DEVICE_TABLE_NAME) + self.status_table = swsscommon.Table(self.state_db, PCIE_STATUS_TABLE_NAME) + + def __del__(self): + if self.device_table: + table_keys = self.device_table.getKeys() + for tk in table_keys: + self.device_table._del(tk) + if self.status_table: + stable_keys = self.status_table.getKeys() + for stk in stable_keys: + self.status_table._del(stk) + + # load aer-fields into statedb + def update_aer_to_statedb(self): + if self.aer_stats is None: + self.log_debug("PCIe device {} has no AER Stats".format(device_name)) + return aer_fields = {} - for field, value in aer_stats['correctable'].items(): - correctable_field = "correctable|" + field - aer_fields[correctable_field] = value - - for field, value in aer_stats['fatal'].items(): - fatal_field = "fatal|" + field - aer_fields[fatal_field] = value - - for field, value in aer_stats['non_fatal'].items(): - non_fatal_field = "non_fatal|" + field - aer_fields[non_fatal_field] = value + for key, fv in self.aer_stats.items(): + for field, value in fv.items(): + key_field = 
"{}|{}".format(key,field) + aer_fields[key_field] = value if aer_fields: formatted_fields = swsscommon.FieldValuePairs(list(aer_fields.items())) - self.device_table.set(device_name, formatted_fields) + self.device_table.set(self.device_name, formatted_fields) else: - self.log_debug("PCIe device {} has no AER attriutes".format(device_name)) + self.log_debug("PCIe device {} has no AER attriutes".format(self.device_name)) - # Check the PCIe devices - def check_pcie_devices(self): - try: - platform_path, _ = device_info.get_paths_to_platform_and_hwsku_dirs() - from sonic_platform_base.sonic_pcie.pcie_common import PcieUtil - platform_pcieutil = PcieUtil(platform_path) - except ImportError as e: - self.log_error("Failed to load default PcieUtil module. Error : {}".format(str(e)), True) - raise e - resultInfo = platform_pcieutil.get_pcie_check() - err = 0 + # Check the PCIe AER Stats + def check_n_update_pcie_aer_stats(self, Bus, Dev, Fn): + self.device_name = "%02x:%02x.%d" % (Bus, Dev, Fn) - for item in resultInfo: - if item["result"] == "Failed": - self.log_warning("PCIe Device: " + item["name"] + " Not Found") - err += 1 + Id = read_id_file(self.device_name) + self.aer_stats = {} + if Id is not None: + self.device_table.set(self.device_name, [('id', Id)]) + self.aer_stats = platform_pcieutil.get_pcie_aer_stats(bus=Bus, dev=Dev, func=Fn) + self.update_aer_to_statedb() + + + # Update the PCIe devices status to DB + def update_pcie_devices_status_db(self, err): if err: - self.update_state_db("PCIE_DEVICES", "status", "FAILED") - self.log_error("PCIe device status check : FAILED") + pcie_status = "FAILED" + self.log_error("PCIe device status check : {}".format(pcie_status)) else: - self.update_state_db("PCIE_DEVICES", "status", "PASSED") - self.log_info("PCIe device status check : PASSED") + pcie_status = "PASSED" + self.log_info("PCIe device status check : {}".format(pcie_status)) + fvs = swsscommon.FieldValuePairs([ + ('status', pcie_status) + ]) - # update AER-attributes to DB - for item in resultInfo: - if item["result"] == "Failed": - continue + self.status_table.set("status", fvs) - Bus = int(item["bus"], 16) - Dev = int(item["dev"], 16) - Fn = int(item["fn"], 16) + # Check the PCIe devices + def check_pcie_devices(self): + self.resultInfo = platform_pcieutil.get_pcie_check() + err = 0 + if self.resultInfo is None: + return - device_name = "%02x:%02x.%d" % (Bus, Dev, Fn) - dev_id_path = '/sys/bus/pci/devices/0000:%s/device' % device_name - with open(dev_id_path, 'r') as fd: - Id = fd.read().strip() + for result in self.resultInfo: + if result["result"] == "Failed": + self.log_warning("PCIe Device: " + result["name"] + " Not Found") + err += 1 + else: + Bus = int(result["bus"], 16) + Dev = int(result["dev"], 16) + Fn = int(result["fn"], 16) + # update AER-attributes to DB + self.check_n_update_pcie_aer_stats(Bus, Dev, Fn) - self.device_table.set(device_name, [('id', Id)]) - aer_stats = platform_pcieutil.get_pcie_aer_stats(bus=Bus, device=Dev, func=Fn) - self.update_aer_to_statedb(device_name, aer_stats) + # update PCIe Device Status to DB + self.update_pcie_devices_status_db(err) - def read_state_db(self, key1, key2): - return self.state_db.get('STATE_DB', key1, key2) + # Override signal handler from DaemonBase + def signal_handler(self, sig, frame): + FATAL_SIGNALS = [signal.SIGINT, signal.SIGTERM] + NONFATAL_SIGNALS = [signal.SIGHUP] - def update_state_db(self, key1, key2, value): - self.state_db.set('STATE_DB', key1, key2, value) + global exit_code - # Signal handler - def 
signal_handler(self, sig, frame): - if sig == signal.SIGHUP: - self.log_info("Caught SIGHUP - ignoring...") - elif sig == signal.SIGINT: - self.log_info("Caught SIGINT - exiting...") - self.stop_event.set() - elif sig == signal.SIGTERM: - self.log_info("Caught SIGTERM - exiting...") + if sig in FATAL_SIGNALS: + self.log_info("Caught signal '{}' - exiting...".format(SIGNALS_TO_NAMES_DICT[sig])) + exit_code = 128 + sig # Make sure we exit with a non-zero code so that supervisor will try to restart us self.stop_event.set() + elif sig in NONFATAL_SIGNALS: + self.log_info("Caught signal '{}' - ignoring...".format(SIGNALS_TO_NAMES_DICT[sig])) else: - self.log_warning("Caught unhandled signal '" + sig + "'") + self.log_warning("Caught unhandled signal '{}' - ignoring...".format(SIGNALS_TO_NAMES_DICT[sig])) - # Initialize daemon - def init(self): - self.log_info("Start daemon init...") - - # Deinitialize daemon - def deinit(self): - self.log_info("Start daemon deinit...") - - # Run daemon + # Main daemon logic def run(self): - self.log_info("Starting up...") - - # Start daemon initialization sequence - self.init() - - # Start main loop - self.log_info("Start daemon main loop") - - while not self.stop_event.wait(self.timeout): - # Check the Pcie device status - self.check_pcie_devices() - - self.log_info("Stop daemon main loop") + if self.stop_event.wait(self.timeout): + # We received a fatal signal + return False - # Start daemon deinitialization sequence - self.deinit() - - self.log_info("Shutting down...") + self.check_pcie_devices() + return True # # Main ========================================================================= # @@ -174,7 +201,15 @@ class DaemonPcied(daemon_base.DaemonBase): def main(): pcied = DaemonPcied(SYSLOG_IDENTIFIER) - pcied.run() + + pcied.log_info("Starting up...") + + while pcied.run(): + pass + + pcied.log_info("Shutting down...") + + return exit_code if __name__ == '__main__': - main() + sys.exit(main()) diff --git a/sonic-pcied/setup.cfg b/sonic-pcied/setup.cfg new file mode 100644 index 000000000..b7e478982 --- /dev/null +++ b/sonic-pcied/setup.cfg @@ -0,0 +1,2 @@ +[aliases] +test=pytest diff --git a/sonic-pcied/setup.py b/sonic-pcied/setup.py index 0d7d32118..7c7cbc146 100644 --- a/sonic-pcied/setup.py +++ b/sonic-pcied/setup.py @@ -14,8 +14,19 @@ 'scripts/pcied', ], setup_requires=[ + 'pytest-runner', 'wheel' ], + install_requires=[ + 'enum34; python_version < "3.4"', + 'sonic-py-common', + ], + tests_requires=[ + 'mock>=2.0.0; python_version < "3.3"', + 'pytest', + 'pytest-cov', + 'sonic-platform-common' + ], classifiers=[ 'Development Status :: 4 - Beta', 'Environment :: No Input/Output (Daemon)', @@ -29,4 +40,5 @@ 'Topic :: System :: Hardware', ], keywords='sonic SONiC PCIe pcie PCIED pcied', + test_suite='setup.get_test_suite' ) diff --git a/sonic-pcied/tests/__init__.py b/sonic-pcied/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/sonic-pcied/tests/mock_platform.py b/sonic-pcied/tests/mock_platform.py new file mode 100644 index 000000000..7d8a32a90 --- /dev/null +++ b/sonic-pcied/tests/mock_platform.py @@ -0,0 +1,48 @@ + +""" + Mock implementation of sonic_platform package for unit testing +""" + +# TODO: Clean this up once we no longer need to support Python 2 +import sys +if sys.version_info.major == 3: + from unittest import mock +else: + import mock + + + +pcie_device_list = \ +""" +[{'bus': '00', 'dev': '01', 'fn': '0', 'id': '1f10', 'name': 'PCI A'}] +""" + +pcie_check_result = \ +""" +[{'bus': '00', 'dev': '01', 'fn': 
'0', 'id': '1f10', 'name': 'PCI A', 'result': 'Passed'}] +""" + +pcie_aer_stats = \ +""" +{'correctable': {}, 'fatal': {}, 'non_fatal': {}} +""" + +#class MockPcieUtil(PcieUtil): +class MockPcieUtil(): + def __init__(self, + pciList=pcie_device_list, + result=pcie_check_result, + aer_stats=pcie_aer_stats): + super(MockPcieUtil, self).__init__() + self._pciList = pciList + self._result = result + self._aer_stats = aer_stats + + def get_pcie_device(self): + return self._pciList + + def get_pcie_check(self): + return self._result + + def get_pcie_aer_stats(self, domain, bus, dev, fn): + return self._aer_stats diff --git a/sonic-pcied/tests/mocked_libs/swsscommon/__init__.py b/sonic-pcied/tests/mocked_libs/swsscommon/__init__.py new file mode 100644 index 000000000..012af621e --- /dev/null +++ b/sonic-pcied/tests/mocked_libs/swsscommon/__init__.py @@ -0,0 +1,5 @@ +''' + Mock implementation of swsscommon package for unit testing +''' + +from . import swsscommon diff --git a/sonic-pcied/tests/mocked_libs/swsscommon/swsscommon.py b/sonic-pcied/tests/mocked_libs/swsscommon/swsscommon.py new file mode 100644 index 000000000..6947a8601 --- /dev/null +++ b/sonic-pcied/tests/mocked_libs/swsscommon/swsscommon.py @@ -0,0 +1,54 @@ +''' + Mock implementation of swsscommon package for unit testing +''' + +STATE_DB = '' + + +class Table: + def __init__(self, db, table_name): + self.table_name = table_name + self.mock_dict = {} + + def _del(self, key): + del self.mock_dict[key] + pass + + def set(self, key, fvs): + self.mock_dict[key] = fvs.fv_dict + pass + + def get(self, key): + if key in self.mock_dict: + return self.mock_dict[key] + return None + + def get_size(self): + return (len(self.mock_dict)) + + +class FieldValuePairs: + fv_dict = {} + + def __init__(self, tuple_list): + if isinstance(tuple_list, list) and isinstance(tuple_list[0], tuple): + self.fv_dict = dict(tuple_list) + + def __setitem__(self, key, kv_tuple): + self.fv_dict[kv_tuple[0]] = kv_tuple[1] + + def __getitem__(self, key): + return self.fv_dict[key] + + def __eq__(self, other): + if not isinstance(other, FieldValuePairs): + # don't attempt to compare against unrelated types + return NotImplemented + + return self.fv_dict == other.fv_dict + + def __repr__(self): + return repr(self.fv_dict) + + def __str__(self): + return repr(self.fv_dict) diff --git a/sonic-pcied/tests/test_DaemonPcied.py b/sonic-pcied/tests/test_DaemonPcied.py new file mode 100644 index 000000000..24044767e --- /dev/null +++ b/sonic-pcied/tests/test_DaemonPcied.py @@ -0,0 +1,226 @@ +import datetime +import os +import sys +from imp import load_source # Replace with importlib once we no longer need to support Python 2 + +import pytest + +# TODO: Clean this up once we no longer need to support Python 2 +if sys.version_info.major == 3: + from unittest import mock +else: + import mock + +from sonic_py_common import daemon_base + +from .mock_platform import MockPcieUtil + +SYSLOG_IDENTIFIER = 'pcied_test' +NOT_AVAILABLE = 'N/A' + +daemon_base.db_connect = mock.MagicMock() + +tests_path = os.path.dirname(os.path.abspath(__file__)) + +# Add mocked_libs path so that the file under test can load mocked modules from there +mocked_libs_path = os.path.join(tests_path, "mocked_libs") +sys.path.insert(0, mocked_libs_path) + +# Add path to the file under test so that we can load it +modules_path = os.path.dirname(tests_path) +scripts_path = os.path.join(modules_path, "scripts") +sys.path.insert(0, modules_path) +load_source('pcied', os.path.join(scripts_path, 'pcied')) +import 
pcied + +pcie_no_aer_stats = \ +""" +{'correctable': {}, 'fatal': {}, 'non_fatal': {}} +""" + +pcie_aer_stats_no_err = \ +""" +{'correctable': {'field1': '0', 'field2': '0'}, + 'fatal': {'field3': '0', 'field4': '0'}, + 'non_fatal': {'field5': '0', 'field6': '0'}} +""" + +pcie_aer_stats_err = \ +""" +{'correctable': {'field1': '1', 'field2': '0'}, + 'fatal': {'field3': '0', 'field4': '1'}, + 'non_fatal': {'field5': '0', 'field6': '1'}} +""" + +pcie_device_list = \ +""" +[{'bus': '00', 'dev': '01', 'fn': '0', 'id': '1f10', 'name': 'PCI A'}, + {'bus': '00', 'dev': '02', 'fn': '0', 'id': '1f11', 'name': 'PCI B'}, + {'bus': '00', 'dev': '03', 'fn': '0', 'id': '1f13', 'name': 'PCI C'}] +""" + +pcie_check_result_no = [] + +pcie_check_result_pass = \ +""" +[{'bus': '00', 'dev': '01', 'fn': '0', 'id': '1f10', 'name': 'PCI A', 'result': 'Passed'}, + {'bus': '00', 'dev': '02', 'fn': '0', 'id': '1f11', 'name': 'PCI B', 'result': 'Passed'}, + {'bus': '00', 'dev': '03', 'fn': '0', 'id': '1f12', 'name': 'PCI C', 'result': 'Passed'}] +""" + +pcie_check_result_fail = \ +""" +[{'bus': '00', 'dev': '01', 'fn': '0', 'id': '1f10', 'name': 'PCI A', 'result': 'Passed'}, + {'bus': '00', 'dev': '02', 'fn': '0', 'id': '1f11', 'name': 'PCI B', 'result': 'Passed'}, + {'bus': '00', 'dev': '03', 'fn': '0', 'id': '1f12', 'name': 'PCI C', 'result': 'Failed'}] +""" + +class TestDaemonPcied(object): + """ + Test cases to cover functionality in DaemonPcied class + """ + + @mock.patch('pcied.load_platform_pcieutil', mock.MagicMock()) + def test_signal_handler(self): + daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER) + daemon_pcied.stop_event.set = mock.MagicMock() + daemon_pcied.log_info = mock.MagicMock() + daemon_pcied.log_warning = mock.MagicMock() + + # Test SIGHUP + daemon_pcied.signal_handler(pcied.signal.SIGHUP, None) + assert daemon_pcied.log_info.call_count == 1 + daemon_pcied.log_info.assert_called_with("Caught signal 'SIGHUP' - ignoring...") + assert daemon_pcied.log_warning.call_count == 0 + assert daemon_pcied.stop_event.set.call_count == 0 + assert pcied.exit_code == 0 + + # Reset + daemon_pcied.log_info.reset_mock() + daemon_pcied.log_warning.reset_mock() + daemon_pcied.stop_event.set.reset_mock() + + # Test SIGINT + test_signal = pcied.signal.SIGINT + daemon_pcied.signal_handler(test_signal, None) + assert daemon_pcied.log_info.call_count == 1 + daemon_pcied.log_info.assert_called_with("Caught signal 'SIGINT' - exiting...") + assert daemon_pcied.log_warning.call_count == 0 + assert daemon_pcied.stop_event.set.call_count == 1 + assert pcied.exit_code == (128 + test_signal) + + # Reset + daemon_pcied.log_info.reset_mock() + daemon_pcied.log_warning.reset_mock() + daemon_pcied.stop_event.set.reset_mock() + + # Test SIGTERM + test_signal = pcied.signal.SIGTERM + daemon_pcied.signal_handler(test_signal, None) + assert daemon_pcied.log_info.call_count == 1 + daemon_pcied.log_info.assert_called_with("Caught signal 'SIGTERM' - exiting...") + assert daemon_pcied.log_warning.call_count == 0 + assert daemon_pcied.stop_event.set.call_count == 1 + assert pcied.exit_code == (128 + test_signal) + + # Reset + daemon_pcied.log_info.reset_mock() + daemon_pcied.log_warning.reset_mock() + daemon_pcied.stop_event.set.reset_mock() + pcied.exit_code = 0 + + # Test an unhandled signal + daemon_pcied.signal_handler(pcied.signal.SIGUSR1, None) + assert daemon_pcied.log_warning.call_count == 1 + daemon_pcied.log_warning.assert_called_with("Caught unhandled signal 'SIGUSR1' - ignoring...") + assert daemon_pcied.log_info.call_count 
== 0 + assert daemon_pcied.stop_event.set.call_count == 0 + assert pcied.exit_code == 0 + + @mock.patch('pcied.load_platform_pcieutil', mock.MagicMock()) + def test_run(self): + daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER) + daemon_pcied.check_pcie_devices = mock.MagicMock() + + daemon_pcied.run() + assert daemon_pcied.check_pcie_devices.call_count == 1 + + @mock.patch('pcied.load_platform_pcieutil', mock.MagicMock()) + def test_check_pcie_devices(self): + daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER) + daemon_pcied.update_pcie_devices_status_db = mock.MagicMock() + daemon_pcied.check_n_update_pcie_aer_stats = mock.MagicMock() + pcied.platform_pcieutil.get_pcie_check = mock.MagicMock() + + daemon_pcied.check_pcie_devices() + assert daemon_pcied.update_pcie_devices_status_db.call_count == 1 + assert daemon_pcied.check_n_update_pcie_aer_stats.call_count == 0 + + + @mock.patch('pcied.load_platform_pcieutil', mock.MagicMock()) + def test_update_pcie_devices_status_db(self): + daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER) + daemon_pcied.status_table = mock.MagicMock() + daemon_pcied.log_info = mock.MagicMock() + daemon_pcied.log_error = mock.MagicMock() + + # test for pass resultInfo + daemon_pcied.update_pcie_devices_status_db(0) + assert daemon_pcied.status_table.set.call_count == 1 + assert daemon_pcied.log_info.call_count == 1 + assert daemon_pcied.log_error.call_count == 0 + + daemon_pcied.status_table.set.reset_mock() + daemon_pcied.log_info.reset_mock() + + # test for resultInfo with 1 device failed to detect + daemon_pcied.update_pcie_devices_status_db(1) + assert daemon_pcied.status_table.set.call_count == 1 + assert daemon_pcied.log_info.call_count == 0 + assert daemon_pcied.log_error.call_count == 1 + + + @mock.patch('pcied.load_platform_pcieutil', mock.MagicMock()) + @mock.patch('pcied.read_id_file') + def test_check_n_update_pcie_aer_stats(self, mock_read): + daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER) + daemon_pcied.device_table = mock.MagicMock() + daemon_pcied.update_aer_to_statedb = mock.MagicMock() + pcied.platform_pcieutil.get_pcie_aer_stats = mock.MagicMock() + + mock_read.return_value = None + daemon_pcied.check_n_update_pcie_aer_stats(0,1,0) + assert daemon_pcied.update_aer_to_statedb.call_count == 0 + assert daemon_pcied.device_table.set.call_count == 0 + assert pcied.platform_pcieutil.get_pcie_aer_stats.call_count == 0 + + mock_read.return_value = '1714' + daemon_pcied.check_n_update_pcie_aer_stats(0,1,0) + assert daemon_pcied.update_aer_to_statedb.call_count == 1 + assert daemon_pcied.device_table.set.call_count == 1 + assert pcied.platform_pcieutil.get_pcie_aer_stats.call_count == 1 + + + @mock.patch('pcied.load_platform_pcieutil', mock.MagicMock()) + def test_update_aer_to_statedb(self): + daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER) + daemon_pcied.log_debug = mock.MagicMock() + daemon_pcied.device_table = mock.MagicMock() + daemon_pcied.device_name = mock.MagicMock() + daemon_pcied.aer_stats = mock.MagicMock() + + + mocked_expected_fvp = pcied.swsscommon.FieldValuePairs( + [("correctable|field1", '0'), + ("correctable|field2", '0'), + ("fatal|field3", '0'), + ("fatal|field4", '0'), + ("non_fatal|field5", '0'), + ("non_fatal|field6", '0'), + ]) + + daemon_pcied.update_aer_to_statedb() + assert daemon_pcied.log_debug.call_count == 1 + assert daemon_pcied.device_table.set.call_count == 0 + + daemon_pcied.device_table.set.reset_mock() diff --git a/sonic-pcied/tests/test_pcied.py b/sonic-pcied/tests/test_pcied.py new file mode 100644 
index 000000000..f3b3e78e9 --- /dev/null +++ b/sonic-pcied/tests/test_pcied.py @@ -0,0 +1,43 @@ +import os +import sys +from imp import load_source # Replace with importlib once we no longer need to support Python 2 + +import pytest + +# TODO: Clean this up once we no longer need to support Python 2 +if sys.version_info.major == 3: + from unittest import mock +else: + import mock +from sonic_py_common import daemon_base, device_info + +from .mock_platform import MockPcieUtil + +tests_path = os.path.dirname(os.path.abspath(__file__)) + +# Add mocked_libs path so that the file under test can load mocked modules from there +mocked_libs_path = os.path.join(tests_path, "mocked_libs") +sys.path.insert(0, mocked_libs_path) + +# Add path to the file under test so that we can load it +modules_path = os.path.dirname(tests_path) +scripts_path = os.path.join(modules_path, "scripts") +sys.path.insert(0, modules_path) +load_source('pcied', os.path.join(scripts_path, 'pcied')) +import pcied + + +daemon_base.db_connect = mock.MagicMock() + + +SYSLOG_IDENTIFIER = 'pcied_test' +NOT_AVAILABLE = 'N/A' + + +@mock.patch('pcied.load_platform_pcieutil', mock.MagicMock()) +@mock.patch('pcied.DaemonPcied.run') +def test_main(mock_run): + mock_run.return_value = False + + pcied.main() + assert mock_run.call_count == 1 From 53639ded1ec5a5d534f8f571dd7d7aa664ba086c Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox <57339448+Junchao-Mellanox@users.noreply.github.com> Date: Wed, 23 Jun 2021 01:28:08 +0800 Subject: [PATCH 15/17] [xcvrd] Add bitmap support for SFP error event (#184) Support SFP error bitmap. Currently, SONiC use a single value to represent SFP error, however, multiple SFP errors could exist at the same time. This PR is aimed to support it Signed-off-by: Stephen Sun --- sonic-xcvrd/tests/test_xcvrd.py | 41 +++++-- sonic-xcvrd/xcvrd/xcvrd.py | 109 ++++++++---------- .../xcvrd_utilities/sfp_status_helper.py | 37 ++++++ .../xcvrd/xcvrd_utilities/y_cable_helper.py | 34 ++---- 4 files changed, 128 insertions(+), 93 deletions(-) create mode 100644 sonic-xcvrd/xcvrd/xcvrd_utilities/sfp_status_helper.py diff --git a/sonic-xcvrd/tests/test_xcvrd.py b/sonic-xcvrd/tests/test_xcvrd.py index 11f759f01..507890162 100644 --- a/sonic-xcvrd/tests/test_xcvrd.py +++ b/sonic-xcvrd/tests/test_xcvrd.py @@ -1,10 +1,7 @@ import os import sys -import subprocess -import pytest import unittest -from imp import load_source if sys.version_info >= (3, 3): from unittest.mock import MagicMock, patch else: @@ -12,6 +9,7 @@ from sonic_py_common import daemon_base from swsscommon import swsscommon +from sonic_platform_base.sfp_base import SfpBase from .mock_swsscommon import Table @@ -24,13 +22,12 @@ test_path = os.path.dirname(os.path.abspath(__file__)) modules_path = os.path.dirname(test_path) scripts_path = os.path.join(modules_path, "xcvrd") -helper_file_path = os.path.join(scripts_path, "xcvrd_utilities"+"/y_cable_helper.py") sys.path.insert(0, modules_path) os.environ["XCVRD_UNIT_TESTING"] = "1" -load_source('y_cable_helper', scripts_path + '/xcvrd_utilities/y_cable_helper.py') -from y_cable_helper import * from xcvrd.xcvrd import * +from xcvrd.xcvrd_utilities.y_cable_helper import * +from xcvrd.xcvrd_utilities.sfp_status_helper import * class TestXcvrdScript(object): @@ -219,9 +216,9 @@ def test_init_port_sfp_status_tbl(self): init_port_sfp_status_tbl(stop_event) @patch('xcvrd.xcvrd_utilities.y_cable_helper.y_cable_platform_sfputil', MagicMock(return_value=[0])) - @patch('y_cable_helper.logical_port_name_to_physical_port_list', 
MagicMock(return_value=[0])) - @patch('y_cable_helper._wrapper_get_presence', MagicMock(return_value=True)) - @patch('y_cable_helper.get_muxcable_info', MagicMock(return_value={'tor_active': 'self', + @patch('xcvrd.xcvrd_utilities.y_cable_helper.logical_port_name_to_physical_port_list', MagicMock(return_value=[0])) + @patch('xcvrd.xcvrd_utilities.y_cable_helper._wrapper_get_presence', MagicMock(return_value=True)) + @patch('xcvrd.xcvrd_utilities.y_cable_helper.get_muxcable_info', MagicMock(return_value={'tor_active': 'self', 'mux_direction': 'self', 'manual_switch_count': '7', 'auto_switch_count': '71', @@ -258,9 +255,9 @@ def test_post_port_mux_info_to_db(self): assert(rc != -1) @patch('xcvrd.xcvrd_utilities.y_cable_helper.y_cable_platform_sfputil', MagicMock(return_value=[0])) - @patch('y_cable_helper.logical_port_name_to_physical_port_list', MagicMock(return_value=[0])) - @patch('y_cable_helper._wrapper_get_presence', MagicMock(return_value=True)) - @patch('y_cable_helper.get_muxcable_static_info', MagicMock(return_value={'read_side': 'self', + @patch('xcvrd.xcvrd_utilities.y_cable_helper.logical_port_name_to_physical_port_list', MagicMock(return_value=[0])) + @patch('xcvrd.xcvrd_utilities.y_cable_helper._wrapper_get_presence', MagicMock(return_value=True)) + @patch('xcvrd.xcvrd_utilities.y_cable_helper.get_muxcable_static_info', MagicMock(return_value={'read_side': 'self', 'nic_lane1_precursor1': '1', 'nic_lane1_precursor2': '-7', 'nic_lane1_maincursor': '-1', @@ -318,3 +315,23 @@ def test_get_media_settings_key(self): result = get_media_settings_key(0, xcvr_info_dict) assert result == ['MOLEX-1064141421', 'QSFP+-*'] # TODO: Ensure that error message was logged + + def test_detect_port_in_error_status(self): + class MockTable: + def get(self, key): + pass + + status_tbl = MockTable() + status_tbl.get = MagicMock(return_value=(True, {'error': 'N/A'})) + assert not detect_port_in_error_status(None, status_tbl) + + status_tbl.get = MagicMock(return_value=(True, {'error': SfpBase.SFP_ERROR_DESCRIPTION_BLOCKING})) + assert detect_port_in_error_status(None, status_tbl) + + def test_is_error_sfp_status(self): + error_values = [7, 11, 19, 35] + for error_value in error_values: + assert is_error_block_eeprom_reading(error_value) + + assert not is_error_block_eeprom_reading(int(SFP_STATUS_INSERTED)) + assert not is_error_block_eeprom_reading(int(SFP_STATUS_REMOVED)) diff --git a/sonic-xcvrd/xcvrd/xcvrd.py b/sonic-xcvrd/xcvrd/xcvrd.py index e17237f45..563145a8e 100644 --- a/sonic-xcvrd/xcvrd/xcvrd.py +++ b/sonic-xcvrd/xcvrd/xcvrd.py @@ -15,11 +15,11 @@ import threading import time - from enum import Enum from sonic_py_common import daemon_base, device_info, logger from sonic_py_common import multi_asic from swsscommon import swsscommon + from .xcvrd_utilities import sfp_status_helper from .xcvrd_utilities import y_cable_helper except ImportError as e: raise ImportError(str(e) + " - required module not found") @@ -43,18 +43,6 @@ TIME_FOR_SFP_READY_SECS = 1 XCVRD_MAIN_THREAD_SLEEP_SECS = 60 -# SFP status definition, shall be aligned with the definition in get_change_event() of ChassisBase -SFP_STATUS_REMOVED = '0' -SFP_STATUS_INSERTED = '1' - -# SFP error code enum, new elements can be added to the enum if new errors need to be supported. 
-SFP_STATUS_ERR_ENUM = Enum('SFP_STATUS_ERR_ENUM', ['SFP_STATUS_ERR_I2C_STUCK', 'SFP_STATUS_ERR_BAD_EEPROM', - 'SFP_STATUS_ERR_UNSUPPORTED_CABLE', 'SFP_STATUS_ERR_HIGH_TEMP', - 'SFP_STATUS_ERR_BAD_CABLE'], start=2) - -# Convert the error code to string and store them in a set for convenience -errors_block_eeprom_reading = set(str(error_code.value) for error_code in SFP_STATUS_ERR_ENUM) - EVENT_ON_ALL_SFP = '-1' # events definition SYSTEM_NOT_READY = 'system_not_ready' @@ -188,11 +176,13 @@ def _wrapper_get_transceiver_change_event(timeout): if platform_chassis is not None: try: status, events = platform_chassis.get_change_event(timeout) - sfp_events = events['sfp'] - return status, sfp_events + sfp_events = events.get('sfp') + sfp_errors = events.get('sfp_error') + return status, sfp_events, sfp_errors except NotImplementedError: pass - return platform_sfputil.get_transceiver_change_event(timeout) + status, events = platform_sfputil.get_transceiver_change_event(timeout) + return status, events, None def _wrapper_get_sfp_type(physical_port): @@ -203,6 +193,14 @@ def _wrapper_get_sfp_type(physical_port): pass return None + +def _wrapper_get_sfp_error_description(physical_port): + if platform_chassis: + try: + return platform_chassis.get_sfp(physical_port).get_error_description() + except NotImplementedError: + pass + return None # Remove unnecessary unit from the raw data @@ -553,7 +551,7 @@ def recover_missing_sfp_table_entries(sfp_util, int_tbl, status_tbl, stop_event) continue keys = int_tbl[asic_index].getKeys() - if logical_port_name not in keys and not detect_port_in_error_status(logical_port_name, status_tbl[asic_index]): + if logical_port_name not in keys and not sfp_status_helper.detect_port_in_error_status(logical_port_name, status_tbl[asic_index]): post_port_sfp_info_to_db(logical_port_name, int_tbl[asic_index], transceiver_dict, stop_event) @@ -791,30 +789,17 @@ def waiting_time_compensation_with_sleep(time_start, time_to_wait): # Update port SFP status table on receiving SFP change event -def update_port_transceiver_status_table(logical_port_name, status_tbl, status): - fvs = swsscommon.FieldValuePairs([('status', status)]) +def update_port_transceiver_status_table(logical_port_name, status_tbl, status, error_descriptions='N/A'): + fvs = swsscommon.FieldValuePairs([('status', status), ('error', error_descriptions)]) status_tbl.set(logical_port_name, fvs) + # Delete port from SFP status table def delete_port_from_status_table(logical_port_name, status_tbl): status_tbl._del(logical_port_name) -# Check whether port in error status - - -def detect_port_in_error_status(logical_port_name, status_tbl): - rec, fvp = status_tbl.get(logical_port_name) - if rec: - status_dict = dict(fvp) - if status_dict['status'] in errors_block_eeprom_reading: - return True - else: - return False - else: - return False - # Init TRANSCEIVER_STATUS table @@ -844,16 +829,16 @@ def init_port_sfp_status_tbl(stop_event=threading.Event()): physical_port_list = logical_port_name_to_physical_port_list(logical_port_name) if physical_port_list is None: helper_logger.log_error("No physical ports found for logical port '{}'".format(logical_port_name)) - update_port_transceiver_status_table(logical_port_name, status_tbl[asic_index], SFP_STATUS_REMOVED) + update_port_transceiver_status_table(logical_port_name, status_tbl[asic_index], sfp_status_helper.SFP_STATUS_REMOVED) for physical_port in physical_port_list: if stop_event.is_set(): break if not _wrapper_get_presence(physical_port): - 
update_port_transceiver_status_table(logical_port_name, status_tbl[asic_index], SFP_STATUS_REMOVED) + update_port_transceiver_status_table(logical_port_name, status_tbl[asic_index], sfp_status_helper.SFP_STATUS_REMOVED) else: - update_port_transceiver_status_table(logical_port_name, status_tbl[asic_index], SFP_STATUS_INSERTED) + update_port_transceiver_status_table(logical_port_name, status_tbl[asic_index], sfp_status_helper.SFP_STATUS_INSERTED) # # Helper classes =============================================================== @@ -892,7 +877,7 @@ def task_worker(self, y_cable_presence): logger.log_warning("Got invalid asic index for {}, ignored".format(logical_port_name)) continue - if not detect_port_in_error_status(logical_port_name, status_tbl[asic_index]): + if not sfp_status_helper.detect_port_in_error_status(logical_port_name, status_tbl[asic_index]): post_port_dom_info_to_db(logical_port_name, dom_tbl[asic_index], self.task_stopping_event) post_port_dom_threshold_info_to_db(logical_port_name, dom_tbl[asic_index], self.task_stopping_event) if y_cable_presence[0] is True: @@ -1035,7 +1020,7 @@ def task_worker(self, stopping_event, sfp_error_event, y_cable_presence): while not stopping_event.is_set(): next_state = state time_start = time.time() - status, port_dict = _wrapper_get_transceiver_change_event(timeout) + status, port_dict, error_dict = _wrapper_get_transceiver_change_event(timeout) if not port_dict: continue helper_logger.log_debug("Got event {} {} in state {}".format(status, port_dict, state)) @@ -1095,11 +1080,11 @@ def task_worker(self, stopping_event, sfp_error_event, y_cable_presence): logger.log_warning("Got invalid asic index for {}, ignored".format(logical_port)) continue - if value == SFP_STATUS_INSERTED: + if value == sfp_status_helper.SFP_STATUS_INSERTED: helper_logger.log_info("Got SFP inserted event") # A plugin event will clear the error state. update_port_transceiver_status_table( - logical_port, status_tbl[asic_index], SFP_STATUS_INSERTED) + logical_port, status_tbl[asic_index], sfp_status_helper.SFP_STATUS_INSERTED) helper_logger.log_info("receive plug in and update port sfp status table.") rc = post_port_sfp_info_to_db(logical_port, int_tbl[asic_index], transceiver_dict) # If we didn't get the sfp info, assuming the eeprom is not ready, give a try again. @@ -1111,28 +1096,36 @@ def task_worker(self, stopping_event, sfp_error_event, y_cable_presence): post_port_dom_threshold_info_to_db(logical_port, dom_tbl[asic_index]) notify_media_setting(logical_port, transceiver_dict, app_port_tbl[asic_index]) transceiver_dict.clear() - elif value == SFP_STATUS_REMOVED: + elif value == sfp_status_helper.SFP_STATUS_REMOVED: helper_logger.log_info("Got SFP removed event") update_port_transceiver_status_table( - logical_port, status_tbl[asic_index], SFP_STATUS_REMOVED) - helper_logger.log_info("receive plug out and pdate port sfp status table.") + logical_port, status_tbl[asic_index], sfp_status_helper.SFP_STATUS_REMOVED) + helper_logger.log_info("receive plug out and update port sfp status table.") del_port_sfp_dom_info_from_db(logical_port, int_tbl[asic_index], dom_tbl[asic_index]) - elif value in errors_block_eeprom_reading: - helper_logger.log_info("Got SFP Error event") - # Add port to error table to stop accessing eeprom of it - # If the port already in the error table, the stored error code will - # be updated to the new one. 
- update_port_transceiver_status_table(logical_port, status_tbl[asic_index], value) - helper_logger.log_info("receive error update port sfp status table.") - # In this case EEPROM is not accessible, so remove the DOM info - # since it will be outdated if long time no update. - # but will keep the interface info in the DB since it static. - del_port_sfp_dom_info_from_db(logical_port, None, dom_tbl[asic_index]) - else: - # SFP return unkown event, just ignore for now. - helper_logger.log_warning("Got unknown event {}, ignored".format(value)) - continue + try: + error_bits = int(value) + helper_logger.log_info("Got SFP error event {}".format(value)) + + error_descriptions = sfp_status_helper.fetch_generic_error_description(error_bits) + + if sfp_status_helper.has_vendor_specific_error(error_bits): + if error_dict: + vendor_specific_error_description = error_dict.get(key) + else: + vendor_specific_error_description = _wrapper_get_sfp_error_description(key) + error_descriptions.append(vendor_specific_error_description) + + # Add error info to database + # Any existing error will be replaced by the new one. + update_port_transceiver_status_table(logical_port, status_tbl[asic_index], value, '|'.join(error_descriptions)) + helper_logger.log_info("Receive error update port sfp status table.") + # In this case EEPROM is not accessible. The DOM info will be removed since it can be out-of-date. + # The interface info remains in the DB since it is static. + if sfp_status_helper.is_error_block_eeprom_reading(error_bits): + del_port_sfp_dom_info_from_db(logical_port, None, dom_tbl[asic_index]) + except (TypeError, ValueError) as e: + logger.log_error("Got unrecognized event {}, ignored".format(value)) # Since ports could be connected to a mux cable, if there is a change event process the change for being on a Y cable Port y_cable_helper.change_ports_status_for_y_cable_change_event( diff --git a/sonic-xcvrd/xcvrd/xcvrd_utilities/sfp_status_helper.py b/sonic-xcvrd/xcvrd/xcvrd_utilities/sfp_status_helper.py new file mode 100644 index 000000000..789b761e4 --- /dev/null +++ b/sonic-xcvrd/xcvrd/xcvrd_utilities/sfp_status_helper.py @@ -0,0 +1,37 @@ +from sonic_platform_base.sfp_base import SfpBase + +# SFP status definition, shall be aligned with the definition in get_change_event() of ChassisBase +SFP_STATUS_REMOVED = '0' +SFP_STATUS_INSERTED = '1' + +# SFP error code dictinary, new elements can be added if new errors need to be supported. 
+SFP_ERRORS_BLOCKING_MASK = 0x02 +SFP_ERRORS_GENERIC_MASK = 0x0000FFFE +SFP_ERRORS_VENDOR_SPECIFIC_MASK = 0xFFFF0000 + +def is_error_block_eeprom_reading(error_bits): + return 0 != (error_bits & SFP_ERRORS_BLOCKING_MASK) + + +def has_vendor_specific_error(error_bits): + return 0 != (error_bits & SFP_ERRORS_VENDOR_SPECIFIC_MASK) + + +def fetch_generic_error_description(error_bits): + generic_error_bits = (error_bits & SFP_ERRORS_GENERIC_MASK) + error_descriptions = [] + if generic_error_bits: + for error_bit, error_description in SfpBase.SFP_ERROR_BIT_TO_DESCRIPTION_DICT.items(): + if error_bit & generic_error_bits: + error_descriptions.append(error_description) + return error_descriptions + + +def detect_port_in_error_status(logical_port_name, status_tbl): + rec, fvp = status_tbl.get(logical_port_name) + if rec: + status_dict = dict(fvp) + error = status_dict.get('error') + return SfpBase.SFP_ERROR_DESCRIPTION_BLOCKING in error + return False + diff --git a/sonic-xcvrd/xcvrd/xcvrd_utilities/y_cable_helper.py b/sonic-xcvrd/xcvrd/xcvrd_utilities/y_cable_helper.py index ac2fc4e0d..0857d8323 100644 --- a/sonic-xcvrd/xcvrd/xcvrd_utilities/y_cable_helper.py +++ b/sonic-xcvrd/xcvrd/xcvrd_utilities/y_cable_helper.py @@ -10,6 +10,7 @@ from sonic_py_common import multi_asic from sonic_y_cable import y_cable from swsscommon import swsscommon +from . import sfp_status_helper SELECT_TIMEOUT = 1000 @@ -21,27 +22,6 @@ helper_logger = logger.Logger(SYSLOG_IDENTIFIER) - -# SFP status definition, shall be aligned with the definition in get_change_event() of ChassisBase -SFP_STATUS_REMOVED = '0' -SFP_STATUS_INSERTED = '1' - -# SFP error codes, stored as strings. Can add more as needed. -SFP_STATUS_ERR_I2C_STUCK = '2' -SFP_STATUS_ERR_BAD_EEPROM = '3' -SFP_STATUS_ERR_UNSUPPORTED_CABLE = '4' -SFP_STATUS_ERR_HIGH_TEMP = '5' -SFP_STATUS_ERR_BAD_CABLE = '6' - -# Store the error codes in a set for convenience -errors_block_eeprom_reading = { - SFP_STATUS_ERR_I2C_STUCK, - SFP_STATUS_ERR_BAD_EEPROM, - SFP_STATUS_ERR_UNSUPPORTED_CABLE, - SFP_STATUS_ERR_HIGH_TEMP, - SFP_STATUS_ERR_BAD_CABLE -} - Y_CABLE_STATUS_NO_TOR_ACTIVE = 0 Y_CABLE_STATUS_TORA_ACTIVE = 1 Y_CABLE_STATUS_TORB_ACTIVE = 2 @@ -438,15 +418,23 @@ def change_ports_status_for_y_cable_change_event(port_dict, y_cable_presence, st continue if logical_port_name in port_table_keys[asic_index]: - if value == SFP_STATUS_INSERTED: + if value == sfp_status_helper.SFP_STATUS_INSERTED: helper_logger.log_info("Got SFP inserted event") check_identifier_presence_and_update_mux_table_entry( state_db, port_tbl, y_cable_tbl, static_tbl, mux_tbl, asic_index, logical_port_name, y_cable_presence) - elif value == SFP_STATUS_REMOVED or value in errors_block_eeprom_reading: + elif value == sfp_status_helper.SFP_STATUS_REMOVED: check_identifier_presence_and_delete_mux_table_entry( state_db, port_tbl, asic_index, logical_port_name, y_cable_presence, delete_change_event) else: + try: + # Now that the value is in bitmap format, let's convert it to number + event_bits = int(value) + if sfp_status_helper.is_error_block_eeprom_reading(event_bits): + check_identifier_presence_and_delete_mux_table_entry( + state_db, port_tbl, asic_index, logical_port_name, y_cable_presence, delete_change_event) + except: + pass # SFP return unkown event, just ignore for now. 
helper_logger.log_warning("Got unknown event {}, ignored".format(value)) continue From b2c610254c756529440f66a8279c624b5a6fe921 Mon Sep 17 00:00:00 2001 From: ngoc-do <72221689+ngoc-do@users.noreply.github.com> Date: Wed, 23 Jun 2021 12:05:39 -0700 Subject: [PATCH 16/17] Collect asic info and store in CHASSIS_STATE_DB (#175) Chassisd inside pmon queries asic information from platform modules and stores it into CHASSIS_STATE_DB of redis_chassis. The reason we keep information in CHASSIS_STATE_DB because it's accessible by swss containers. Asic information includes asic pci address, module name, and asic id in the module. For example, CHASSIS_STATE_DB will look like below: 127.0.0.1:6379[6]> HGETALL "CHASSIS_ASIC_TABLE|ASIC0" 1) "asic_pci_address" 2) "0000:09:00.0" 3) "module_name" 4) "FABRIC-CARD0" 5) "asic_id_in_module" 6) "0" --- sonic-chassisd/scripts/chassisd | 42 +++++++++++++- sonic-chassisd/tests/mock_platform.py | 9 ++- sonic-chassisd/tests/mock_swsscommon.py | 3 + sonic-chassisd/tests/test_chassisd.py | 75 +++++++++++++++++++++++++ 4 files changed, 126 insertions(+), 3 deletions(-) diff --git a/sonic-chassisd/scripts/chassisd b/sonic-chassisd/scripts/chassisd index 355382911..771b73f5a 100644 --- a/sonic-chassisd/scripts/chassisd +++ b/sonic-chassisd/scripts/chassisd @@ -44,6 +44,13 @@ CHASSIS_MODULE_INFO_NAME_FIELD = 'name' CHASSIS_MODULE_INFO_DESC_FIELD = 'desc' CHASSIS_MODULE_INFO_SLOT_FIELD = 'slot' CHASSIS_MODULE_INFO_OPERSTATUS_FIELD = 'oper_status' +CHASSIS_MODULE_INFO_NUM_ASICS_FIELD = 'num_asics' +CHASSIS_MODULE_INFO_ASICS = 'asics' + +CHASSIS_ASIC_INFO_TABLE = 'CHASSIS_ASIC_TABLE' +CHASSIS_ASIC = 'asic' +CHASSIS_ASIC_PCI_ADDRESS_FIELD = 'asic_pci_address' +CHASSIS_ASIC_ID_IN_MODULE_FIELD = 'asic_id_in_module' CHASSIS_MIDPLANE_INFO_TABLE = 'CHASSIS_MIDPLANE_TABLE' CHASSIS_MIDPLANE_INFO_KEY_TEMPLATE = 'CHASSIS_MIDPLANE {}' @@ -163,6 +170,9 @@ class ModuleUpdater(logger.Logger): CHASSIS_MODULE_INFO_SLOT_FIELD, CHASSIS_MODULE_INFO_OPERSTATUS_FIELD] + chassis_state_db = daemon_base.db_connect("CHASSIS_STATE_DB") + self.asic_table = swsscommon.Table(chassis_state_db, CHASSIS_ASIC_INFO_TABLE) + self.midplane_initialized = try_get(chassis.init_midplane_switch, default=False) if not self.midplane_initialized: self.log_error("Chassisd midplane intialization failed") @@ -182,6 +192,11 @@ class ModuleUpdater(logger.Logger): if self.chassis_table is not None: self.chassis_table._del(CHASSIS_INFO_KEY_TEMPLATE.format(1)) + if self.asic_table is not None: + asics = list(self.asic_table.getKeys()) + for asic in asics: + self.asic_table._del(asic) + def modules_num_update(self): # Check if module list is populated num_modules = self.chassis.get_num_modules() @@ -194,6 +209,8 @@ class ModuleUpdater(logger.Logger): self.chassis_table.set(CHASSIS_INFO_KEY_TEMPLATE.format(1), fvs) def module_db_update(self): + notOnlineModules = [] + for module_index in range(0, self.num_modules): module_info_dict = self._get_module_info(module_index) if module_info_dict is not None: @@ -211,9 +228,29 @@ class ModuleUpdater(logger.Logger): fvs = swsscommon.FieldValuePairs([(CHASSIS_MODULE_INFO_DESC_FIELD, module_info_dict[CHASSIS_MODULE_INFO_DESC_FIELD]), (CHASSIS_MODULE_INFO_SLOT_FIELD, module_info_dict[CHASSIS_MODULE_INFO_SLOT_FIELD]), - (CHASSIS_MODULE_INFO_OPERSTATUS_FIELD, module_info_dict[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD])]) + (CHASSIS_MODULE_INFO_OPERSTATUS_FIELD, module_info_dict[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD]), + (CHASSIS_MODULE_INFO_NUM_ASICS_FIELD, 
str(len(module_info_dict[CHASSIS_MODULE_INFO_ASICS])))]) self.module_table.set(key, fvs) + if module_info_dict[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD] != str(ModuleBase.MODULE_STATUS_ONLINE): + notOnlineModules.append(key) + continue + + for asic_id, asic in enumerate(module_info_dict[CHASSIS_MODULE_INFO_ASICS]): + asic_global_id, asic_pci_addr = asic + asic_key = "%s%s" % (CHASSIS_ASIC, asic_global_id) + asic_fvs = swsscommon.FieldValuePairs([(CHASSIS_ASIC_PCI_ADDRESS_FIELD, asic_pci_addr), + (CHASSIS_MODULE_INFO_NAME_FIELD, key), + (CHASSIS_ASIC_ID_IN_MODULE_FIELD, str(asic_id))]) + self.asic_table.set(asic_key, asic_fvs) + + # Asics that are on the "not online" modules need to be cleaned up + asics = list(self.asic_table.getKeys()) + for asic in asics: + fvs = self.asic_table.get(asic) + if fvs[CHASSIS_MODULE_INFO_NAME_FIELD] in notOnlineModules: + self.asic_table._del(asic) + def _get_module_info(self, module_index): """ Retrieves module info of this module @@ -225,11 +262,14 @@ class ModuleUpdater(logger.Logger): slot = try_get(self.chassis.get_module(module_index).get_slot, default=INVALID_SLOT) status = try_get(self.chassis.get_module(module_index).get_oper_status, default=ModuleBase.MODULE_STATUS_OFFLINE) + asics = try_get(self.chassis.get_module(module_index).get_all_asics, + default=[]) module_info_dict[CHASSIS_MODULE_INFO_NAME_FIELD] = name module_info_dict[CHASSIS_MODULE_INFO_DESC_FIELD] = str(desc) module_info_dict[CHASSIS_MODULE_INFO_SLOT_FIELD] = str(slot) module_info_dict[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD] = str(status) + module_info_dict[CHASSIS_MODULE_INFO_ASICS] = asics return module_info_dict diff --git a/sonic-chassisd/tests/mock_platform.py b/sonic-chassisd/tests/mock_platform.py index f13356c52..a1ab20530 100644 --- a/sonic-chassisd/tests/mock_platform.py +++ b/sonic-chassisd/tests/mock_platform.py @@ -19,7 +19,8 @@ def get_serial(self): class MockModule(MockDevice): - def __init__(self, module_index, module_name, module_desc, module_type, module_slot): + def __init__(self, module_index, module_name, module_desc, module_type, module_slot, + asic_list=[]): self.module_index = module_index self.module_name = module_name self.module_desc = module_desc @@ -29,7 +30,8 @@ def __init__(self, module_index, module_name, module_desc, module_type, module_s self.admin_state = 1 self.supervisor_slot = 16 self.midplane_access = False - + self.asic_list = asic_list + def get_name(self): return self.module_name @@ -69,6 +71,9 @@ def is_midplane_reachable(self): def set_midplane_reachable(self, up): self.midplane_access = up + def get_all_asics(self): + return self.asic_list + class MockChassis: def __init__(self): self.module_list = [] diff --git a/sonic-chassisd/tests/mock_swsscommon.py b/sonic-chassisd/tests/mock_swsscommon.py index 8e9c68761..589084225 100644 --- a/sonic-chassisd/tests/mock_swsscommon.py +++ b/sonic-chassisd/tests/mock_swsscommon.py @@ -19,6 +19,9 @@ def get(self, key): return self.mock_dict[key] return None + def getKeys(self): + return list(self.mock_dict) + def size(self): return len(self.mock_dict) diff --git a/sonic-chassisd/tests/test_chassisd.py b/sonic-chassisd/tests/test_chassisd.py index ef3d2fb67..2950abd81 100644 --- a/sonic-chassisd/tests/test_chassisd.py +++ b/sonic-chassisd/tests/test_chassisd.py @@ -31,6 +31,8 @@ CHASSIS_INFO_KEY_TEMPLATE = 'CHASSIS {}' CHASSIS_INFO_CARD_NUM_FIELD = 'module_num' +CHASSIS_ASIC_PCI_ADDRESS_FIELD = 'asic_pci_address' +CHASSIS_ASIC_ID_IN_MODULE_FIELD = 'asic_id_in_module' def setup_function(): 
ModuleUpdater.log_notice = MagicMock() @@ -366,3 +368,76 @@ def test_midplane_presence_supervisor(): module_updater.deinit() fvs = midplane_table.get(name) assert fvs == None + +def test_asic_presence(): + chassis = MockChassis() + + #Supervisor + index = 0 + name = "SUPERVISOR0" + desc = "Supervisor card" + slot = 16 + module_type = ModuleBase.MODULE_TYPE_SUPERVISOR + supervisor = MockModule(index, name, desc, module_type, slot) + supervisor.set_midplane_ip() + chassis.module_list.append(supervisor) + + #Linecard + index = 1 + name = "LINE-CARD0" + desc = "36 port 400G card" + slot = 1 + module_type = ModuleBase.MODULE_TYPE_LINE + module = MockModule(index, name, desc, module_type, slot) + module.set_midplane_ip() + chassis.module_list.append(module) + + #Fabric-card with asics + index = 1 + name = "FABRIC-CARD0" + desc = "Switch fabric card" + slot = 17 + module_type = ModuleBase.MODULE_TYPE_FABRIC + fabric_asic_list = [("4", "0000:04:00.0"), ("5", "0000:05:00.0")] + fabric = MockModule(index, name, desc, module_type, slot, fabric_asic_list) + chassis.module_list.append(fabric) + + #Run on supervisor + module_updater = ModuleUpdater(SYSLOG_IDENTIFIER, chassis) + module_updater.supervisor_slot = supervisor.get_slot() + module_updater.my_slot = supervisor.get_slot() + module_updater.modules_num_update() + module_updater.module_db_update() + module_updater.check_midplane_reachability() + + #Asic presence on fabric module + fabric.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + module_updater.module_db_update() + fabric_asic_table = module_updater.asic_table + assert len(fabric_asic_table.getKeys()) == 2 + + def verify_fabric_asic(asic_name, asic_pci_address, module_name, asic_id_in_module): + fvs = fabric_asic_table.get(asic_name) + assert fvs[CHASSIS_ASIC_PCI_ADDRESS_FIELD] == asic_pci_address + assert fvs[CHASSIS_MODULE_INFO_NAME_FIELD] == module_name + assert fvs[CHASSIS_ASIC_ID_IN_MODULE_FIELD] == asic_id_in_module + + verify_fabric_asic("asic4", "0000:04:00.0", name, "0") + verify_fabric_asic("asic5", "0000:05:00.0", name, "1") + + #Card goes down and asics should be gone + fabric.set_oper_status(ModuleBase.MODULE_STATUS_OFFLINE) + module_updater.module_db_update() + assert len(fabric_asic_table.getKeys()) == 0 + + #Deinit + fabric.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE) + module_updater.module_db_update() + module_updater.deinit() + midplane_table = module_updater.midplane_table + fvs = midplane_table.get(name) + assert fvs == None + fvs = fabric_asic_table.get("asic4") + assert fvs == None + fvs = fabric_asic_table.get("asic5") + assert fvs == None From 2d2749ab77ea0cfb9b1a9a0a5c7eeffbde9daed8 Mon Sep 17 00:00:00 2001 From: vdahiya12 <67608553+vdahiya12@users.noreply.github.com> Date: Thu, 24 Jun 2021 09:28:24 -0700 Subject: [PATCH 17/17] [xcvrd] add debug logs for y_cable change events/probes (#195) This PR adds support for logging required in debugging events for change/probe requests received by xcvrd from swss and linkmgr into state DB. 
a typical log would look like this: Jun 17 05:55:39.302379 NOTICE pmon#xcvrd[33]: Y_CABLE_DEBUG: trying to enable/disable debug logs Jun 17 05:55:39.302379 NOTICE pmon#xcvrd[33]: Y_CABLE_DEBUG: received an event for port transition Ethernet4 Jun 17 05:55:39.302575 NOTICE pmon#xcvrd[33]: Y_CABLE_DEBUG: xcvrd trying to transition port Ethernet4 from standby to active Jun 17 05:55:39.307872 NOTICE pmon#xcvrd[33]: Y_CABLE_DEBUG: Successful in toggling mux to ToR A for port 2 where both probe events and change events are logged inside xcvrd. Motivation and Context This is required for xcvrd to log the events which it receives from other modules when we want to see the completion of events as to what triggered the event and at which stage it failed without restarting pmon/xcvrd Signed-off-by: vaibhav-dahiya --- .../xcvrd/xcvrd_utilities/y_cable_helper.py | 51 ++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/sonic-xcvrd/xcvrd/xcvrd_utilities/y_cable_helper.py b/sonic-xcvrd/xcvrd/xcvrd_utilities/y_cable_helper.py index 0857d8323..4ac4b4de0 100644 --- a/sonic-xcvrd/xcvrd/xcvrd_utilities/y_cable_helper.py +++ b/sonic-xcvrd/xcvrd/xcvrd_utilities/y_cable_helper.py @@ -68,6 +68,8 @@ def update_table_mux_status_for_response_tbl(table_name, status, logical_port_na fvs = swsscommon.FieldValuePairs([('response', status)]) table_name.set(logical_port_name, fvs) + helper_logger.log_debug("Y_CABLE_DEBUG: Successful in returning probe port status {}".format(logical_port_name)) + def update_table_mux_status_for_statedb_port_tbl(table_name, status, read_side, active_side, logical_port_name): fvs = swsscommon.FieldValuePairs([('state', status), @@ -78,6 +80,8 @@ def update_table_mux_status_for_statedb_port_tbl(table_name, status, read_side, def y_cable_toggle_mux_torA(physical_port): update_status = y_cable.toggle_mux_to_torA(physical_port) + + helper_logger.log_debug("Y_CABLE_DEBUG: Status of toggling mux to ToR A for port {} {}".format(physical_port, update_status)) if update_status is True: return 1 else: @@ -88,6 +92,8 @@ def y_cable_toggle_mux_torA(physical_port): def y_cable_toggle_mux_torB(physical_port): update_status = y_cable.toggle_mux_to_torB(physical_port) + + helper_logger.log_debug("Y_CABLE_DEBUG: Status of toggling mux to ToR B for port {} {}".format(physical_port, update_status)) if update_status is True: return 2 else: @@ -173,6 +179,8 @@ def update_appdb_port_mux_cable_response_table(logical_port_name, asic_index, ap helper_logger.log_warning( "Error: Could not get state for mux cable port probe command logical port {} and physical port {}".format(logical_port_name, physical_port)) + helper_logger.log_debug("Y_CABLE_DEBUG: notifying a probe for port status {} {}".format(logical_port_name, status)) + update_table_mux_status_for_response_tbl(y_cable_response_tbl[asic_index], status, logical_port_name) else: @@ -347,10 +355,12 @@ def init_ports_status_for_y_cable(platform_sfp, platform_chassis, y_cable_presen config_db, state_db, port_tbl, y_cable_tbl = {}, {}, {}, {} static_tbl, mux_tbl = {}, {} port_table_keys = {} + xcvrd_log_tbl = {} y_cable_platform_sfputil = platform_sfp y_cable_platform_chassis = platform_chassis + fvs_updated = swsscommon.FieldValuePairs([('enable_log', 'false')]) # Get the namespaces in the platform namespaces = multi_asic.get_front_end_namespaces() for namespace in namespaces: @@ -358,6 +368,8 @@ def init_ports_status_for_y_cable(platform_sfp, platform_chassis, y_cable_presen config_db[asic_id] = daemon_base.db_connect("CONFIG_DB", 
namespace) port_tbl[asic_id] = swsscommon.Table(config_db[asic_id], "MUX_CABLE") port_table_keys[asic_id] = port_tbl[asic_id].getKeys() + xcvrd_log_tbl[asic_id] = swsscommon.Table(config_db[asic_id], "XCVRD_LOG") + xcvrd_log_tbl[asic_id].set("Y_CABLE", fvs_updated ) # Init PORT_STATUS table if ports are on Y cable logical_port_list = y_cable_platform_sfputil.logical @@ -996,10 +1008,11 @@ def __init__(self): def task_worker(self): # Connect to STATE_DB and APPL_DB and get both the HW_MUX_STATUS_TABLE info - appl_db, state_db, status_tbl, y_cable_tbl = {}, {}, {}, {} + appl_db, state_db, config_db, status_tbl, y_cable_tbl = {}, {}, {}, {}, {} y_cable_tbl_keys = {} mux_cable_command_tbl, y_cable_command_tbl = {}, {} mux_metrics_tbl = {} + xcvrd_log_tbl = {} sel = swsscommon.Select() @@ -1009,6 +1022,7 @@ def task_worker(self): # Open a handle to the Application database, in all namespaces asic_id = multi_asic.get_asic_index_from_namespace(namespace) appl_db[asic_id] = daemon_base.db_connect("APPL_DB", namespace) + config_db[asic_id] = daemon_base.db_connect("CONFIG_DB", namespace) status_tbl[asic_id] = swsscommon.SubscriberStateTable( appl_db[asic_id], swsscommon.APP_HW_MUX_CABLE_TABLE_NAME) mux_cable_command_tbl[asic_id] = swsscommon.SubscriberStateTable( @@ -1020,9 +1034,12 @@ def task_worker(self): state_db[asic_id], swsscommon.STATE_HW_MUX_CABLE_TABLE_NAME) mux_metrics_tbl[asic_id] = swsscommon.Table( state_db[asic_id], swsscommon.STATE_MUX_METRICS_TABLE_NAME) + xcvrd_log_tbl[asic_id] = swsscommon.SubscriberStateTable( + config_db[asic_id], "XCVRD_LOG") y_cable_tbl_keys[asic_id] = y_cable_tbl[asic_id].getKeys() sel.addSelectable(status_tbl[asic_id]) sel.addSelectable(mux_cable_command_tbl[asic_id]) + sel.addSelectable(xcvrd_log_tbl[asic_id]) # Listen indefinitely for changes to the HW_MUX_CABLE_TABLE in the Application DB's while True: @@ -1048,9 +1065,12 @@ def task_worker(self): while True: (port, op, fvp) = status_tbl[asic_index].pop() + if not port: break + helper_logger.log_debug("Y_CABLE_DEBUG: received an event for port transition {}".format(port)) + # entering this section signifies a start for xcvrd state # change request from swss so initiate recording in mux_metrics table time_start = datetime.datetime.utcnow().strftime("%Y-%b-%d %H:%M:%S.%f") @@ -1075,6 +1095,7 @@ def task_worker(self): old_status = mux_port_dict.get("state") read_side = mux_port_dict.get("read_side") # Now whatever is the state requested, toggle the mux appropriately + helper_logger.log_debug("Y_CABLE_DEBUG: xcvrd trying to transition port {} from {} to {}".format(port, old_status, new_status)) active_side = update_tor_active_side(read_side, new_status, port) if active_side == -1: helper_logger.log_warning("ERR: Got a change event for toggle but could not toggle the mux-direction for port {} state from {} to {}, writing unknown".format( @@ -1085,6 +1106,7 @@ def task_worker(self): ('read_side', read_side), ('active_side', str(active_side))]) y_cable_tbl[asic_index].set(port, fvs_updated) + helper_logger.log_debug("Y_CABLE_DEBUG: xcvrd successful to transition port {} from {} to {} and write back to the DB".format(port, old_status, new_status)) helper_logger.log_info("Got a change event for toggle the mux-direction active side for port {} state from {} to {}".format( port, old_status, new_status)) time_end = datetime.datetime.utcnow().strftime("%Y-%b-%d %H:%M:%S.%f") @@ -1097,8 +1119,11 @@ def task_worker(self): while True: (port_m, op_m, fvp_m) = mux_cable_command_tbl[asic_index].pop() + if not port_m: break 
+ helper_logger.log_debug("Y_CABLE_DEBUG: received a probe for port status {}".format(port_m)) + if fvp_m: if port_m not in y_cable_tbl_keys[asic_index]: @@ -1120,6 +1145,30 @@ def task_worker(self): read_side = mux_port_dict.get("read_side") update_appdb_port_mux_cable_response_table(port_m, asic_index, appl_db, int(read_side)) + while True: + (key, op_m, fvp_m) = xcvrd_log_tbl[asic_index].pop() + + if not key: + break + + helper_logger.log_notice("Y_CABLE_DEBUG: trying to enable/disable debug logs") + if fvp_m: + + if key is "Y_CABLE": + continue + + fvp_dict = dict(fvp_m) + if "log_verbosity" in fvp_dict: + # check if xcvrd got a probe command + probe_identifier = fvp_dict["log_verbosity"] + + if probe_identifier == "debug": + helper_logger.set_min_log_priority_debug() + + elif probe_identifier == "notice": + helper_logger.set_min_log_priority_notice() + + def task_run(self): self.task_thread = threading.Thread(target=self.task_worker) self.task_thread.start()
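The XCVRD_LOG hook added in PATCH 17/17 only requires that some CONFIG_DB entry in the XCVRD_LOG table carry a "log_verbosity" field set to "debug" or "notice"; the handler skips the "Y_CABLE" key written at init and processes any other key. A minimal sketch of flipping xcvrd's y_cable log verbosity from a Python shell, assuming it is run on a SONiC device where swsscommon and sonic_py_common are importable; the key name "VERBOSITY" is a hypothetical choice, not something the patch defines:

    from sonic_py_common import daemon_base
    from swsscommon import swsscommon

    # Connect to CONFIG_DB (default namespace) and open the table that the new
    # SubscriberStateTable in task_worker() listens on.
    config_db = daemon_base.db_connect("CONFIG_DB")
    xcvrd_log_tbl = swsscommon.Table(config_db, "XCVRD_LOG")

    # Any key except "Y_CABLE" is processed; the handler reads the
    # "log_verbosity" field and calls set_min_log_priority_debug()/notice().
    fvs = swsscommon.FieldValuePairs([('log_verbosity', 'debug')])
    xcvrd_log_tbl.set("VERBOSITY", fvs)

Writing 'notice' the same way switches the priority back. On a multi-ASIC system the write would be repeated per namespace, since task_worker() subscribes to XCVRD_LOG once per front-end ASIC.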
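Similarly, the SFP error bitmap introduced in PATCH 15/17 can be decoded without any platform access. A minimal standalone sketch that restates the masks from the new xcvrd_utilities/sfp_status_helper.py; the sample values '1', '7' and '35' come from the unit test, while 0x10002 is a hypothetical value combining the blocking bit with a vendor-specific bit, used only for illustration:

    # Masks restated from xcvrd_utilities/sfp_status_helper.py
    SFP_ERRORS_BLOCKING_MASK = 0x02
    SFP_ERRORS_VENDOR_SPECIFIC_MASK = 0xFFFF0000

    def is_error_block_eeprom_reading(error_bits):
        # Bit 1 marks errors severe enough that xcvrd must stop reading the EEPROM
        return bool(error_bits & SFP_ERRORS_BLOCKING_MASK)

    def has_vendor_specific_error(error_bits):
        # The upper 16 bits are reserved for vendor-specific errors, whose text
        # comes from get_error_description() rather than the generic dictionary
        return bool(error_bits & SFP_ERRORS_VENDOR_SPECIFIC_MASK)

    # Change events now carry the SFP status as a stringified bitmap
    for value in ['1', '7', '35', str(0x10002)]:
        bits = int(value)
        print(value, is_error_block_eeprom_reading(bits), has_vendor_specific_error(bits))

As in test_is_error_sfp_status, '1' (a plain insertion) reports no blocking error, while '7' and '35' have the blocking bit set and would cause the DOM info to be removed from the DB.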