From e99beb6386e6754314c734d2de9c568a66a54f85 Mon Sep 17 00:00:00 2001 From: Shi Su <67605788+shi-su@users.noreply.github.com> Date: Wed, 7 Jul 2021 10:19:45 -0700 Subject: [PATCH] Add failure handling for SAI get operations (#1768) What I did Add failure handling for SAI get operations. The function allows handling failures in SAI get operations according to the orch type, SAI type, SAI status. Why I did it Enable custom failure handling for SAI get operations. --- orchagent/aclorch.cpp | 6 +- orchagent/copporch.cpp | 6 +- orchagent/crmorch.cpp | 12 +++- orchagent/fabricportsorch.cpp | 36 ++++++++-- orchagent/fdborch.cpp | 6 +- orchagent/fgnhgorch.cpp | 10 ++- orchagent/macsecorch.cpp | 15 ++-- orchagent/neighorch.cpp | 6 +- orchagent/orch.cpp | 29 ++++++++ orchagent/orch.h | 1 + orchagent/portsorch.cpp | 126 ++++++++++++++++++++++++++++------ orchagent/qosorch.cpp | 24 +++++-- 12 files changed, 233 insertions(+), 44 deletions(-) diff --git a/orchagent/aclorch.cpp b/orchagent/aclorch.cpp index 7f2ccfb50c..a2f5482a36 100644 --- a/orchagent/aclorch.cpp +++ b/orchagent/aclorch.cpp @@ -2309,7 +2309,11 @@ void AclOrch::init(vector& connectors, PortsOrch *portOrch, Mirr else { SWSS_LOG_ERROR("Failed to get ACL entry priority min/max values, rv:%d", status); - throw "AclOrch initialization failure"; + task_process_status handle_status = handleSaiGetStatus(SAI_API_SWITCH, status); + if (handle_status != task_process_status::task_success) + { + throw "AclOrch initialization failure"; + } } queryAclActionCapability(); diff --git a/orchagent/copporch.cpp b/orchagent/copporch.cpp index 403fcb98d9..34d83dd274 100644 --- a/orchagent/copporch.cpp +++ b/orchagent/copporch.cpp @@ -179,7 +179,11 @@ void CoppOrch::initDefaultTrapGroup() if (status != SAI_STATUS_SUCCESS) { SWSS_LOG_ERROR("Failed to get default trap group, rv:%d", status); - throw "CoppOrch initialization failure"; + task_process_status handle_status = handleSaiGetStatus(SAI_API_SWITCH, status); + if (handle_status != 
task_process_status::task_success) + { + throw "CoppOrch initialization failure"; + } } SWSS_LOG_INFO("Get default trap group"); diff --git a/orchagent/crmorch.cpp b/orchagent/crmorch.cpp index e0eb24239b..bdd899057a 100644 --- a/orchagent/crmorch.cpp +++ b/orchagent/crmorch.cpp @@ -488,7 +488,11 @@ void CrmOrch::getResAvailableCounters() break; } SWSS_LOG_ERROR("Failed to get switch attribute %u , rv:%d", attr.id, status); - break; + task_process_status handle_status = handleSaiGetStatus(SAI_API_SWITCH, status); + if (handle_status != task_process_status::task_success) + { + break; + } } res.second.countersMap[CRM_COUNTERS_TABLE_KEY].availableCounter = attr.value.u32; @@ -517,7 +521,11 @@ void CrmOrch::getResAvailableCounters() if (status != SAI_STATUS_SUCCESS) { SWSS_LOG_ERROR("Failed to get switch attribute %u , rv:%d", attr.id, status); - break; + task_process_status handle_status = handleSaiGetStatus(SAI_API_SWITCH, status); + if (handle_status != task_process_status::task_success) + { + break; + } } for (uint32_t i = 0; i < attr.value.aclresource.count; i++) diff --git a/orchagent/fabricportsorch.cpp b/orchagent/fabricportsorch.cpp index a4644dfffc..1adb84ec08 100644 --- a/orchagent/fabricportsorch.cpp +++ b/orchagent/fabricportsorch.cpp @@ -88,7 +88,11 @@ int FabricPortsOrch::getFabricPortList() if (status != SAI_STATUS_SUCCESS) { SWSS_LOG_ERROR("Failed to get fabric port number, rv:%d", status); - return FABRIC_PORT_ERROR; + task_process_status handle_status = handleSaiGetStatus(SAI_API_SWITCH, status); + if (handle_status != task_process_status::task_success) + { + return FABRIC_PORT_ERROR; + } } m_fabricPortCount = attr.value.u32; SWSS_LOG_NOTICE("Get %d fabric ports", m_fabricPortCount); @@ -101,7 +105,11 @@ int FabricPortsOrch::getFabricPortList() status = sai_switch_api->get_switch_attribute(gSwitchId, 1, &attr); if (status != SAI_STATUS_SUCCESS) { - throw runtime_error("FabricPortsOrch get port list failure"); + task_process_status handle_status = 
handleSaiGetStatus(SAI_API_SWITCH, status); + if (handle_status != task_process_status::task_success) + { + throw runtime_error("FabricPortsOrch get port list failure"); + } } for (i = 0; i < m_fabricPortCount; i++) @@ -113,7 +121,11 @@ int FabricPortsOrch::getFabricPortList() status = sai_port_api->get_port_attribute(fabric_port_list[i], 1, &attr); if (status != SAI_STATUS_SUCCESS) { - throw runtime_error("FabricPortsOrch get port lane failure"); + task_process_status handle_status = handleSaiGetStatus(SAI_API_PORT, status); + if (handle_status != task_process_status::task_success) + { + throw runtime_error("FabricPortsOrch get port lane failure"); + } } int lane = attr.value.u32list.list[0]; m_fabricLanePortMap[lane] = fabric_port_list[i]; @@ -198,7 +210,11 @@ void FabricPortsOrch::updateFabricPortState() { // Port may not be ready for query SWSS_LOG_ERROR("Failed to get fabric port (%d) status, rv:%d", lane, status); - return; + task_process_status handle_status = handleSaiGetStatus(SAI_API_PORT, status); + if (handle_status != task_process_status::task_success) + { + return; + } } if (m_portStatus.find(lane) != m_portStatus.end() && @@ -215,7 +231,11 @@ void FabricPortsOrch::updateFabricPortState() status = sai_port_api->get_port_attribute(port, 1, &attr); if (status != SAI_STATUS_SUCCESS) { - throw runtime_error("FabricPortsOrch get remote id failure"); + task_process_status handle_status = handleSaiGetStatus(SAI_API_PORT, status); + if (handle_status != task_process_status::task_success) + { + throw runtime_error("FabricPortsOrch get remote id failure"); + } } remote_peer = attr.value.u32; @@ -223,7 +243,11 @@ void FabricPortsOrch::updateFabricPortState() status = sai_port_api->get_port_attribute(port, 1, &attr); if (status != SAI_STATUS_SUCCESS) { - throw runtime_error("FabricPortsOrch get remote port index failure"); + task_process_status handle_status = handleSaiGetStatus(SAI_API_PORT, status); + if (handle_status != task_process_status::task_success) + { 
+ throw runtime_error("FabricPortsOrch get remote port index failure"); + } } remote_port = attr.value.u32; } diff --git a/orchagent/fdborch.cpp b/orchagent/fdborch.cpp index af46d958ec..229dec0b15 100644 --- a/orchagent/fdborch.cpp +++ b/orchagent/fdborch.cpp @@ -503,7 +503,11 @@ bool FdbOrch::getPort(const MacAddress& mac, uint16_t vlan, Port& port) { SWSS_LOG_ERROR("Failed to get bridge port ID for FDB entry %s, rv:%d", mac.to_string().c_str(), status); - return false; + task_process_status handle_status = handleSaiGetStatus(SAI_API_FDB, status); + if (handle_status != task_process_status::task_success) + { + return false; + } } if (!m_portsOrch->getPortByBridgePortId(attr.value.oid, port)) diff --git a/orchagent/fgnhgorch.cpp b/orchagent/fgnhgorch.cpp index de791678ac..4111665e09 100644 --- a/orchagent/fgnhgorch.cpp +++ b/orchagent/fgnhgorch.cpp @@ -294,11 +294,15 @@ bool FgNhgOrch::createFineGrainedNextHopGroup(FGNextHopGroupEntry &syncd_fg_rout { SWSS_LOG_ERROR("Failed to query next hop group %s SAI_NEXT_HOP_GROUP_ATTR_REAL_SIZE, rv:%d", nextHops.to_string().c_str(), status); - if (!removeFineGrainedNextHopGroup(&syncd_fg_route_entry)) + task_process_status handle_status = handleSaiGetStatus(SAI_API_NEXT_HOP_GROUP, status); + if (handle_status != task_process_status::task_success) { - SWSS_LOG_ERROR("Failed to clean-up after next hop group real_size query failure"); + if (!removeFineGrainedNextHopGroup(&syncd_fg_route_entry)) + { + SWSS_LOG_ERROR("Failed to clean-up after next hop group real_size query failure"); + } + return false; } - return false; } fgNhgEntry->real_bucket_size = nhg_attr.value.u32; } diff --git a/orchagent/macsecorch.cpp b/orchagent/macsecorch.cpp index c5510a16fa..ead1bfa81c 100644 --- a/orchagent/macsecorch.cpp +++ b/orchagent/macsecorch.cpp @@ -854,15 +854,20 @@ bool MACsecOrch::initMACsecObject(sai_object_id_t switch_id) attrs.clear(); attr.id = SAI_MACSEC_ATTR_SCI_IN_INGRESS_MACSEC_ACL; attrs.push_back(attr); - if 
(sai_macsec_api->get_macsec_attribute( - macsec_obj.first->second.m_ingress_id, - static_cast(attrs.size()), - attrs.data()) != SAI_STATUS_SUCCESS) + status = sai_macsec_api->get_macsec_attribute( + macsec_obj.first->second.m_ingress_id, + static_cast(attrs.size()), + attrs.data()); + if (status != SAI_STATUS_SUCCESS) { SWSS_LOG_WARN( "Cannot get MACsec attribution SAI_MACSEC_ATTR_SCI_IN_INGRESS_MACSEC_ACL at the switch 0x%" PRIx64, switch_id); - return false; + task_process_status handle_status = handleSaiGetStatus(SAI_API_MACSEC, status); + if (handle_status != task_process_status::task_success) + { + return false; + } } macsec_obj.first->second.m_sci_in_ingress_macsec_acl = attrs.front().value.booldata; diff --git a/orchagent/neighorch.cpp b/orchagent/neighorch.cpp index acd8d8718b..b9870a24eb 100644 --- a/orchagent/neighorch.cpp +++ b/orchagent/neighorch.cpp @@ -1547,7 +1547,11 @@ void NeighOrch::voqSyncAddNeigh(string &alias, IpAddress &ip_address, const MacA if (status != SAI_STATUS_SUCCESS) { SWSS_LOG_ERROR("Failed to get neighbor attribute for %s on %s, rv:%d", ip_address.to_string().c_str(), alias.c_str(), status); - return; + task_process_status handle_status = handleSaiGetStatus(SAI_API_NEIGHBOR, status); + if (handle_status != task_process_status::task_success) + { + return; + } } if (!attr.value.u32) diff --git a/orchagent/orch.cpp b/orchagent/orch.cpp index 464e6aa688..14187b79b4 100644 --- a/orchagent/orch.cpp +++ b/orchagent/orch.cpp @@ -765,6 +765,35 @@ task_process_status Orch::handleSaiRemoveStatus(sai_api_t api, sai_status_t stat return task_need_retry; } +task_process_status Orch::handleSaiGetStatus(sai_api_t api, sai_status_t status, void *context) +{ + /* + * This function aims to provide coarse handling of failures in sairedis get + * operation (i.e., notify users by throwing exceptions when failures happen). + * Return value: task_success - Handled the status successfully. No need to retry this SAI operation. 
+ * task_need_retry - Cannot handle the status. Need to retry the SAI operation. + * task_failed - Failed to handle the status but another attempt is unlikely to resolve the failure. + * TODO: 1. Add general handling logic for specific statuses + * 2. Develop fine-grain failure handling mechanisms and replace this coarse handling + * in each orch. + * 3. Take the type of sai api into consideration. + */ + switch (status) + { + case SAI_STATUS_SUCCESS: + SWSS_LOG_WARN("SAI_STATUS_SUCCESS is not expected in handleSaiGetStatus"); + return task_success; + case SAI_STATUS_NOT_IMPLEMENTED: + SWSS_LOG_ERROR("Encountered failure in get operation due to the function is not implemented, exiting orchagent, SAI API: %s", + sai_serialize_api(api).c_str()); + throw std::logic_error("SAI get function not implemented"); + default: + SWSS_LOG_ERROR("Encountered failure in get operation, SAI API: %s, status: %s", + sai_serialize_api(api).c_str(), sai_serialize_status(status).c_str()); + } + return task_failed; +} + bool Orch::parseHandleSaiStatusFailure(task_process_status status) { /* diff --git a/orchagent/orch.h b/orchagent/orch.h index b61cdb53e2..766d02c766 100644 --- a/orchagent/orch.h +++ b/orchagent/orch.h @@ -240,6 +240,7 @@ class Orch virtual task_process_status handleSaiCreateStatus(sai_api_t api, sai_status_t status, void *context = nullptr); virtual task_process_status handleSaiSetStatus(sai_api_t api, sai_status_t status, void *context = nullptr); virtual task_process_status handleSaiRemoveStatus(sai_api_t api, sai_status_t status, void *context = nullptr); + virtual task_process_status handleSaiGetStatus(sai_api_t api, sai_status_t status, void *context = nullptr); bool parseHandleSaiStatusFailure(task_process_status status); private: void removeMeFromObjsReferencedByMe(type_map &type_maps, const std::string &table, const std::string &obj_name, const std::string &field, const std::string &old_referenced_obj_name); diff --git a/orchagent/portsorch.cpp 
b/orchagent/portsorch.cpp index 9b22a551b5..fab6a7ed59 100755 --- a/orchagent/portsorch.cpp +++ b/orchagent/portsorch.cpp @@ -380,7 +380,11 @@ PortsOrch::PortsOrch(DBConnector *db, DBConnector *stateDb, vector tmp_lane_set; @@ -459,7 +475,11 @@ PortsOrch::PortsOrch(DBConnector *db, DBConnector *stateDb, vectorget_port_attribute(id, 1, &attr); if (status == SAI_STATUS_SUCCESS) + { speed = attr.value.u32; + } + else + { + task_process_status handle_status = handleSaiGetStatus(SAI_API_PORT, status); + if (handle_status != task_process_status::task_success) + { + return false; + } + } - return status == SAI_STATUS_SUCCESS; + return true; } bool PortsOrch::setPortAdvSpeeds(sai_object_id_t port_id, std::vector& speed_list) @@ -1994,7 +2040,11 @@ bool PortsOrch::getQueueTypeAndIndex(sai_object_id_t queue_id, string &type, uin if (status != SAI_STATUS_SUCCESS) { SWSS_LOG_ERROR("Failed to get queue type and index for queue %" PRIu64 " rv:%d", queue_id, status); - return false; + task_process_status handle_status = handleSaiGetStatus(SAI_API_QUEUE, status); + if (handle_status != task_process_status::task_success) + { + return false; + } } switch (attr[0].value.s32) @@ -3850,7 +3900,11 @@ void PortsOrch::initializeQueues(Port &port) if (status != SAI_STATUS_SUCCESS) { SWSS_LOG_ERROR("Failed to get number of queues for port %s rv:%d", port.m_alias.c_str(), status); - throw runtime_error("PortsOrch initialization failure."); + task_process_status handle_status = handleSaiGetStatus(SAI_API_PORT, status); + if (handle_status != task_process_status::task_success) + { + throw runtime_error("PortsOrch initialization failure."); + } } SWSS_LOG_INFO("Get %d queues for port %s", attr.value.u32, port.m_alias.c_str()); @@ -3870,7 +3924,11 @@ void PortsOrch::initializeQueues(Port &port) if (status != SAI_STATUS_SUCCESS) { SWSS_LOG_ERROR("Failed to get queue list for port %s rv:%d", port.m_alias.c_str(), status); - throw runtime_error("PortsOrch initialization failure."); + 
task_process_status handle_status = handleSaiGetStatus(SAI_API_PORT, status); + if (handle_status != task_process_status::task_success) + { + throw runtime_error("PortsOrch initialization failure."); + } } SWSS_LOG_INFO("Get queues for port %s", port.m_alias.c_str()); @@ -3886,7 +3944,11 @@ void PortsOrch::initializePriorityGroups(Port &port) if (status != SAI_STATUS_SUCCESS) { SWSS_LOG_ERROR("Failed to get number of priority groups for port %s rv:%d", port.m_alias.c_str(), status); - throw runtime_error("PortsOrch initialization failure."); + task_process_status handle_status = handleSaiGetStatus(SAI_API_PORT, status); + if (handle_status != task_process_status::task_success) + { + throw runtime_error("PortsOrch initialization failure."); + } } SWSS_LOG_INFO("Get %d priority groups for port %s", attr.value.u32, port.m_alias.c_str()); @@ -3907,7 +3969,11 @@ void PortsOrch::initializePriorityGroups(Port &port) if (status != SAI_STATUS_SUCCESS) { SWSS_LOG_ERROR("Fail to get priority group list for port %s rv:%d", port.m_alias.c_str(), status); - throw runtime_error("PortsOrch initialization failure."); + task_process_status handle_status = handleSaiGetStatus(SAI_API_PORT, status); + if (handle_status != task_process_status::task_success) + { + throw runtime_error("PortsOrch initialization failure."); + } } SWSS_LOG_INFO("Get priority groups for port %s", port.m_alias.c_str()); } @@ -5403,7 +5469,11 @@ bool PortsOrch::setPortSerdesAttribute(sai_object_id_t port_id, { SWSS_LOG_ERROR("Failed to get port attr serdes id %d to port pid:0x%" PRIx64, port_attr.id, port_id); - return false; + task_process_status handle_status = handleSaiGetStatus(SAI_API_PORT, status); + if (handle_status != task_process_status::task_success) + { + return false; + } } if (port_attr.value.oid != SAI_NULL_OBJECT_ID) @@ -5897,7 +5967,11 @@ bool PortsOrch::getSystemPorts() if (status != SAI_STATUS_SUCCESS) { SWSS_LOG_ERROR("Failed to get system port list, rv:%d", status); - return false; + 
task_process_status handle_status = handleSaiGetStatus(SAI_API_SWITCH, status); + if (handle_status != task_process_status::task_success) + { + return false; + } } uint32_t spcnt = attr.value.objlist.count; @@ -5909,7 +5983,11 @@ bool PortsOrch::getSystemPorts() if (status != SAI_STATUS_SUCCESS) { SWSS_LOG_ERROR("Failed to get system port config info spid:%" PRIx64, system_port_list[i]); - return false; + task_process_status handle_status = handleSaiGetStatus(SAI_API_SYSTEM_PORT, status); + if (handle_status != task_process_status::task_success) + { + return false; + } } SWSS_LOG_NOTICE("SystemPort(0x%" PRIx64 ") - port_id:%u, switch_id:%u, core:%u, core_port:%u, speed:%u, voqs:%u", @@ -6018,7 +6096,11 @@ bool PortsOrch::addSystemPorts() if (status != SAI_STATUS_SUCCESS) { SWSS_LOG_ERROR("Failed to get system port config info spid:%" PRIx64, system_port_oid); - continue; + task_process_status handle_status = handleSaiGetStatus(SAI_API_SYSTEM_PORT, status); + if (handle_status != task_process_status::task_success) + { + continue; + } } //Create or update system port and add to the port list. @@ -6037,7 +6119,11 @@ bool PortsOrch::addSystemPorts() if (status != SAI_STATUS_SUCCESS) { SWSS_LOG_ERROR("Failed to get local port oid of local system port spid:%" PRIx64, system_port_oid); - continue; + task_process_status handle_status = handleSaiGetStatus(SAI_API_SYSTEM_PORT, status); + if (handle_status != task_process_status::task_success) + { + continue; + } } //System port for local port. 
Update the system port info in the existing physical port diff --git a/orchagent/qosorch.cpp b/orchagent/qosorch.cpp index 31e61b5433..c2e15aa763 100644 --- a/orchagent/qosorch.cpp +++ b/orchagent/qosorch.cpp @@ -933,7 +933,11 @@ sai_object_id_t QosOrch::getSchedulerGroup(const Port &port, const sai_object_id if (SAI_STATUS_SUCCESS != sai_status) { SWSS_LOG_ERROR("Failed to get number of scheduler groups for port:%s", port.m_alias.c_str()); - return SAI_NULL_OBJECT_ID; + task_process_status handle_status = handleSaiGetStatus(SAI_API_PORT, sai_status); + if (handle_status != task_process_status::task_success) + { + return SAI_NULL_OBJECT_ID; + } } /* Get total groups list on the port */ @@ -947,7 +951,11 @@ sai_object_id_t QosOrch::getSchedulerGroup(const Port &port, const sai_object_id if (SAI_STATUS_SUCCESS != sai_status) { SWSS_LOG_ERROR("Failed to get scheduler group list for port:%s", port.m_alias.c_str()); - return SAI_NULL_OBJECT_ID; + task_process_status handle_status = handleSaiGetStatus(SAI_API_PORT, sai_status); + if (handle_status != task_process_status::task_success) + { + return SAI_NULL_OBJECT_ID; + } } m_scheduler_group_port_info[port.m_port_id] = { @@ -969,7 +977,11 @@ sai_object_id_t QosOrch::getSchedulerGroup(const Port &port, const sai_object_id if (SAI_STATUS_SUCCESS != sai_status) { SWSS_LOG_ERROR("Failed to get child count for scheduler group:0x%" PRIx64 " of port:%s", group_id, port.m_alias.c_str()); - return SAI_NULL_OBJECT_ID; + task_process_status handle_status = handleSaiGetStatus(SAI_API_SCHEDULER_GROUP, sai_status); + if (handle_status != task_process_status::task_success) + { + return SAI_NULL_OBJECT_ID; + } } uint32_t child_count = attr.value.u32; @@ -988,7 +1000,11 @@ sai_object_id_t QosOrch::getSchedulerGroup(const Port &port, const sai_object_id if (SAI_STATUS_SUCCESS != sai_status) { SWSS_LOG_ERROR("Failed to get child list for scheduler group:0x%" PRIx64 " of port:%s", group_id, port.m_alias.c_str()); - return SAI_NULL_OBJECT_ID; 
+ task_process_status handle_status = handleSaiGetStatus(SAI_API_SCHEDULER_GROUP, sai_status); + if (handle_status != task_process_status::task_success) + { + return SAI_NULL_OBJECT_ID; + } } m_scheduler_group_port_info[port.m_port_id].child_groups[ii] = std::move(child_groups);