Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[thermalctld] Update line card thermal sensor status to DB #211

Merged
merged 2 commits into from
Oct 18, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
202 changes: 124 additions & 78 deletions sonic-thermalctld/scripts/thermalctld
Original file line number Diff line number Diff line change
Expand Up @@ -538,11 +538,17 @@ class TemperatureUpdater(logger.Logger):

self.is_chassis_system = chassis.is_modular_chassis()
if self.is_chassis_system:
self.module_thermals = set()
my_slot = try_get(chassis.get_my_slot, INVALID_SLOT)
if my_slot != INVALID_SLOT:
table_name = TemperatureUpdater.TEMPER_INFO_TABLE_NAME+'_'+str(my_slot)
chassis_state_db = daemon_base.db_connect("CHASSIS_STATE_DB")
self.chassis_table = swsscommon.Table(chassis_state_db, table_name)
try:
# Modular chassis does not have to have table CHASSIS_STATE_DB.
# So catch the exception here and ignore it.
table_name = TemperatureUpdater.TEMPER_INFO_TABLE_NAME+'_'+str(my_slot)
chassis_state_db = daemon_base.db_connect("CHASSIS_STATE_DB")
self.chassis_table = swsscommon.Table(chassis_state_db, table_name)
except Exception as e:
self.chassis_table = None

def deinit(self):
"""
Expand Down Expand Up @@ -576,31 +582,61 @@ class TemperatureUpdater(logger.Logger):
for index, thermal in enumerate(self.chassis.get_all_thermals()):
if self.task_stopping_event.is_set():
return
try:
self._refresh_temperature_status(CHASSIS_INFO_KEY, thermal, index)
except Exception as e:
self.log_warning('Failed to update thermal status - {}'.format(repr(e)))

self._refresh_temperature_status(CHASSIS_INFO_KEY, thermal, index)

for psu_index, psu in enumerate(self.chassis.get_all_psus()):
parent_name = 'PSU {}'.format(psu_index + 1)
for thermal_index, thermal in enumerate(psu.get_all_thermals()):
if self.task_stopping_event.is_set():
return
try:
self._refresh_temperature_status(parent_name, thermal, thermal_index)
except Exception as e:
self.log_warning('Failed to update thermal status - {}'.format(repr(e)))

self._refresh_temperature_status(parent_name, thermal, thermal_index)

for sfp_index, sfp in enumerate(self.chassis.get_all_sfps()):
parent_name = 'SFP {}'.format(sfp_index + 1)
for thermal_index, thermal in enumerate(sfp.get_all_thermals()):
if self.task_stopping_event.is_set():
return
try:
self._refresh_temperature_status(parent_name, thermal, thermal_index)
except Exception as e:
self.log_warning('Failed to update thermal status - {}'.format(repr(e)))

self._refresh_temperature_status(parent_name, thermal, thermal_index)

if self.is_chassis_system:
available_thermals = set()
for module_index, module in enumerate(self.chassis.get_all_modules()):
module_name = try_get(module.get_name, 'Module {}'.format(module_index + 1))

for thermal_index, thermal in enumerate(module.get_all_thermals()):
if self.task_stopping_event.is_set():
return

available_thermals.add((thermal, module_name, thermal_index))
self._refresh_temperature_status(module_name, thermal, thermal_index)

for sfp_index, sfp in enumerate(module.get_all_sfps()):
sfp_name = '{} SFP {}'.format(module_name, sfp_index + 1)
for thermal_index, thermal in enumerate(sfp.get_all_thermals()):
if self.task_stopping_event.is_set():
return

available_thermals.add((thermal, sfp_name, thermal_index))
self._refresh_temperature_status(sfp_name, thermal, thermal_index)

for psu_index, psu in enumerate(module.get_all_psus()):
psu_name = '{} PSU {}'.format(module_name, psu_index + 1)
for thermal_index, thermal in enumerate(psu.get_all_thermals()):
if self.task_stopping_event.is_set():
return

available_thermals.add((thermal, psu_name, thermal_index))
self._refresh_temperature_status(psu_name, thermal, thermal_index)


thermals_to_remove = self.module_thermals - available_thermals
self.module_thermals = available_thermals
for thermal, parent_name, thermal_index in thermals_to_remove:
self._remove_thermal_from_db(thermal, parent_name, thermal_index)

self.log_debug("End temperature updating")

def _refresh_temperature_status(self, parent_name, thermal, thermal_index):
Expand All @@ -611,72 +647,82 @@ class TemperatureUpdater(logger.Logger):
:param thermal_index: Index of the thermal object in platform chassis
:return:
"""
name = try_get(thermal.get_name, '{} Thermal {}'.format(parent_name, thermal_index + 1))
try:
name = try_get(thermal.get_name, '{} Thermal {}'.format(parent_name, thermal_index + 1))

# Only save entity info for thermals that belong to chassis and PSU
# for SFP thermal, they don't need save entity info because snmp can deduce the relation from TRANSCEIVER_DOM_SENSOR
# and as we save logical port in TRANSCEIVER_INFO table, for split cable, a SFP thermal might have multiple parent
# logical port
if 'SFP' not in parent_name:
update_entity_info(self.phy_entity_table, parent_name, name, thermal, thermal_index + 1)

if name not in self.temperature_status_dict:
self.temperature_status_dict[name] = TemperatureStatus()

temperature_status = self.temperature_status_dict[name]

high_threshold = NOT_AVAILABLE
low_threshold = NOT_AVAILABLE
high_critical_threshold = NOT_AVAILABLE
low_critical_threshold = NOT_AVAILABLE
maximum_temperature = NOT_AVAILABLE
minimum_temperature = NOT_AVAILABLE
temperature = try_get(thermal.get_temperature)
is_replaceable = try_get(thermal.is_replaceable, False)
if temperature != NOT_AVAILABLE:
temperature_status.set_temperature(name, temperature)
minimum_temperature = try_get(thermal.get_minimum_recorded)
maximum_temperature = try_get(thermal.get_maximum_recorded)
high_threshold = try_get(thermal.get_high_threshold)
low_threshold = try_get(thermal.get_low_threshold)
high_critical_threshold = try_get(thermal.get_high_critical_threshold)
low_critical_threshold = try_get(thermal.get_low_critical_threshold)

warning = False
if temperature != NOT_AVAILABLE and temperature_status.set_over_temperature(temperature, high_threshold):
self._log_on_status_changed(not temperature_status.over_temperature,
'High temperature warning cleared: {} temperature restored to {}C, high threshold {}C'.
format(name, temperature, high_threshold),
'High temperature warning: {} current temperature {}C, high threshold {}C'.
format(name, temperature, high_threshold)
)
warning = warning | temperature_status.over_temperature

if temperature != NOT_AVAILABLE and temperature_status.set_under_temperature(temperature, low_threshold):
self._log_on_status_changed(not temperature_status.under_temperature,
'Low temperature warning cleared: {} temperature restored to {}C, low threshold {}C'.
format(name, temperature, low_threshold),
'Low temperature warning: {} current temperature {}C, low threshold {}C'.
format(name, temperature, low_threshold)
)
warning = warning | temperature_status.under_temperature

fvs = swsscommon.FieldValuePairs(
[('temperature', str(temperature)),
('minimum_temperature', str(minimum_temperature)),
('maximum_temperature', str(maximum_temperature)),
('high_threshold', str(high_threshold)),
('low_threshold', str(low_threshold)),
('warning_status', str(warning)),
('critical_high_threshold', str(high_critical_threshold)),
('critical_low_threshold', str(low_critical_threshold)),
('is_replaceable', str(is_replaceable)),
('timestamp', datetime.now().strftime('%Y%m%d %H:%M:%S'))
])

# Only save entity info for thermals that belong to chassis and PSU
# for SFP thermal, they don't need save entity info because snmp can deduce the relation from TRANSCEIVER_DOM_SENSOR
# and as we save logical port in TRANSCEIVER_INFO table, for split cable, a SFP thermal might have multiple parent
# logical port
if 'SFP' not in parent_name:
update_entity_info(self.phy_entity_table, parent_name, name, thermal, thermal_index + 1)

if name not in self.temperature_status_dict:
self.temperature_status_dict[name] = TemperatureStatus()

temperature_status = self.temperature_status_dict[name]

high_threshold = NOT_AVAILABLE
low_threshold = NOT_AVAILABLE
high_critical_threshold = NOT_AVAILABLE
low_critical_threshold = NOT_AVAILABLE
maximum_temperature = NOT_AVAILABLE
minimum_temperature = NOT_AVAILABLE
temperature = try_get(thermal.get_temperature)
is_replaceable = try_get(thermal.is_replaceable, False)
if temperature != NOT_AVAILABLE:
temperature_status.set_temperature(name, temperature)
minimum_temperature = try_get(thermal.get_minimum_recorded)
maximum_temperature = try_get(thermal.get_maximum_recorded)
high_threshold = try_get(thermal.get_high_threshold)
low_threshold = try_get(thermal.get_low_threshold)
high_critical_threshold = try_get(thermal.get_high_critical_threshold)
low_critical_threshold = try_get(thermal.get_low_critical_threshold)

warning = False
if temperature != NOT_AVAILABLE and temperature_status.set_over_temperature(temperature, high_threshold):
self._log_on_status_changed(not temperature_status.over_temperature,
'High temperature warning cleared: {} temperature restored to {}C, high threshold {}C'.
format(name, temperature, high_threshold),
'High temperature warning: {} current temperature {}C, high threshold {}C'.
format(name, temperature, high_threshold)
)
warning = warning | temperature_status.over_temperature

if temperature != NOT_AVAILABLE and temperature_status.set_under_temperature(temperature, low_threshold):
self._log_on_status_changed(not temperature_status.under_temperature,
'Low temperature warning cleared: {} temperature restored to {}C, low threshold {}C'.
format(name, temperature, low_threshold),
'Low temperature warning: {} current temperature {}C, low threshold {}C'.
format(name, temperature, low_threshold)
)
warning = warning | temperature_status.under_temperature
self.table.set(name, fvs)
if self.is_chassis_system and self.chassis_table is not None:
self.chassis_table.set(name, fvs)
except Exception as e:
self.log_warning('Failed to update thermal status for {} - {}'.format(name, repr(e)))

fvs = swsscommon.FieldValuePairs(
[('temperature', str(temperature)),
('minimum_temperature', str(minimum_temperature)),
('maximum_temperature', str(maximum_temperature)),
('high_threshold', str(high_threshold)),
('low_threshold', str(low_threshold)),
('warning_status', str(warning)),
('critical_high_threshold', str(high_critical_threshold)),
('critical_low_threshold', str(low_critical_threshold)),
('is_replaceable', str(is_replaceable)),
('timestamp', datetime.now().strftime('%Y%m%d %H:%M:%S'))
])
def _remove_thermal_from_db(self, thermal, parent_name, thermal_index):
name = try_get(thermal.get_name, '{} Thermal {}'.format(parent_name, thermal_index + 1))
self.table._del(name)

self.table.set(name, fvs)
if self.is_chassis_system and self.chassis_table is not None:
self.chassis_table.set(name, fvs)
if self.chassis_table is not None:
self.chassis_table._del(name)


class ThermalMonitor(ProcessTaskBase):
Expand Down
16 changes: 16 additions & 0 deletions sonic-thermalctld/tests/mock_platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,17 @@ def make_error_thermal(self):
thermal = MockErrorThermal()
self._thermal_list.append(thermal)

def make_module_thermal(self):
module = MockModule()
self._module_list.append(module)
sfp = MockSfp()
sfp._thermal_list.append(MockThermal())
psu = MockPsu()
psu._thermal_list.append(MockThermal())
module._sfp_list.append(sfp)
module._psu_list.append(psu)
module._thermal_list.append(MockThermal())

def is_modular_chassis(self):
return self._is_chassis_system

Expand Down Expand Up @@ -430,3 +441,8 @@ def get_position_in_parent(self):

def is_replaceable(self):
return self._replaceable


class MockModule(module_base.ModuleBase):
def __init__(self):
super(MockModule, self).__init__()
28 changes: 20 additions & 8 deletions sonic-thermalctld/tests/test_thermalctld.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,15 +470,15 @@ def test_update_psu_thermals(self):
temperature_updater.update()
assert temperature_updater.log_warning.call_count == 0

temperature_updater._refresh_temperature_status = mock.MagicMock(side_effect=Exception("Test message"))
mock_thermal.get_temperature = mock.MagicMock(side_effect=Exception("Test message"))
temperature_updater.update()
assert temperature_updater.log_warning.call_count == 1

# TODO: Clean this up once we no longer need to support Python 2
if sys.version_info.major == 3:
temperature_updater.log_warning.assert_called_with("Failed to update thermal status - Exception('Test message')")
temperature_updater.log_warning.assert_called_with("Failed to update thermal status for PSU 1 Thermal 1 - Exception('Test message')")
else:
temperature_updater.log_warning.assert_called_with("Failed to update thermal status - Exception('Test message',)")
temperature_updater.log_warning.assert_called_with("Failed to update thermal status for PSU 1 Thermal 1 - Exception('Test message',)")

def test_update_sfp_thermals(self):
chassis = MockChassis()
Expand All @@ -490,15 +490,15 @@ def test_update_sfp_thermals(self):
temperature_updater.update()
assert temperature_updater.log_warning.call_count == 0

temperature_updater._refresh_temperature_status = mock.MagicMock(side_effect=Exception("Test message"))
mock_thermal.get_temperature = mock.MagicMock(side_effect=Exception("Test message"))
temperature_updater.update()
assert temperature_updater.log_warning.call_count == 1

# TODO: Clean this up once we no longer need to support Python 2
if sys.version_info.major == 3:
temperature_updater.log_warning.assert_called_with("Failed to update thermal status - Exception('Test message')")
temperature_updater.log_warning.assert_called_with("Failed to update thermal status for SFP 1 Thermal 1 - Exception('Test message')")
else:
temperature_updater.log_warning.assert_called_with("Failed to update thermal status - Exception('Test message',)")
temperature_updater.log_warning.assert_called_with("Failed to update thermal status for SFP 1 Thermal 1 - Exception('Test message',)")

def test_update_thermal_with_exception(self):
chassis = MockChassis()
Expand All @@ -514,16 +514,28 @@ def test_update_thermal_with_exception(self):
# TODO: Clean this up once we no longer need to support Python 2
if sys.version_info.major == 3:
expected_calls = [
mock.call("Failed to update thermal status - Exception('Failed to get temperature')"),
mock.call("Failed to update thermal status for chassis 1 Thermal 1 - Exception('Failed to get temperature')"),
mock.call('High temperature warning: chassis 1 Thermal 2 current temperature 3C, high threshold 2C')
]
else:
expected_calls = [
mock.call("Failed to update thermal status - Exception('Failed to get temperature',)"),
mock.call("Failed to update thermal status for chassis 1 Thermal 1 - Exception('Failed to get temperature',)"),
mock.call('High temperature warning: chassis 1 Thermal 2 current temperature 3C, high threshold 2C')
]
assert temperature_updater.log_warning.mock_calls == expected_calls

def test_update_module_thermals(self):
chassis = MockChassis()
chassis.make_module_thermal()
chassis.set_modular_chassis(True)
temperature_updater = thermalctld.TemperatureUpdater(chassis, multiprocessing.Event())
temperature_updater.update()
assert len(temperature_updater.module_thermals) == 3

chassis._module_list = []
temperature_updater.update()
assert len(temperature_updater.module_thermals) == 0


# Modular chassis-related tests

Expand Down