diff --git a/orchagent/muxorch.cpp b/orchagent/muxorch.cpp
index 90c4c439786..8702a24773f 100644
--- a/orchagent/muxorch.cpp
+++ b/orchagent/muxorch.cpp
@@ -24,6 +24,7 @@
 #include "routeorch.h"
 #include "fdborch.h"
 #include "qosorch.h"
+#include "pfcwdorch.h"
 
 /* Global variables */
 extern Directory gDirectory;
@@ -34,6 +35,7 @@ extern AclOrch *gAclOrch;
 extern PortsOrch *gPortsOrch;
 extern FdbOrch *gFdbOrch;
 extern QosOrch *gQosOrch;
+extern PfcWdSwOrch *gPfcWdSwOrch;
 extern sai_object_id_t gVirtualRouterId;
 extern sai_object_id_t gUnderlayIfId;
@@ -476,6 +478,7 @@ void MuxCable::setState(string new_state)
 
     SWSS_LOG_INFO("Changed state to %s", new_state.c_str());
 
     mux_cb_orch_->updateMuxState(mux_name_, new_state);
+    return;
 }
@@ -1495,6 +1498,10 @@ bool MuxCableOrch::addOperation(const Request& request)
     auto state = request.getAttrString("state");
     auto mux_obj = mux_orch->getMuxCable(port_name);
 
+    /* Disable PFC watchdog for performance during the switchover */
+    SWSS_LOG_INFO("Disabling PFCWD for Mux Switchover on %s", port_name.c_str());
+    gPfcWdSwOrch->stopWdOnAllPorts();
+
     try
     {
         mux_obj->setState(state);
@@ -1505,6 +1512,9 @@ bool MuxCableOrch::addOperation(const Request& request)
                         state.c_str(), port_name.c_str(), error.what());
         return true;
     }
+    /* Re-enable PFC watchdog */
+    SWSS_LOG_INFO("Enabling PFCWD for Mux Switchover on %s", port_name.c_str());
+    gPfcWdSwOrch->startWdOnAllPorts();
 
     SWSS_LOG_NOTICE("Mux State set to %s for port %s", state.c_str(), port_name.c_str());
@@ -1531,10 +1541,18 @@ MuxStateOrch::MuxStateOrch(DBConnector *db, const std::string& tableName) :
 
 void MuxStateOrch::updateMuxState(string portName, string muxState)
 {
+    /* Disable PFC watchdog for performance during the switchover */
+    SWSS_LOG_INFO("Disabling PFCWD for Mux Switchover on %s", portName.c_str());
+    gPfcWdSwOrch->stopWdOnAllPorts();
+
     vector<FieldValueTuple> tuples;
     FieldValueTuple tuple("state", muxState);
     tuples.push_back(tuple);
     mux_state_table_.set(portName, tuples);
+
+    /* Re-enable PFC watchdog */
+    SWSS_LOG_INFO("Enabling PFCWD for Mux Switchover on %s", portName.c_str());
+    gPfcWdSwOrch->startWdOnAllPorts();
 }
 
 bool MuxStateOrch::addOperation(const Request& request)
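A note on the switchover path above: MuxCableOrch::addOperation stops the watchdog on all ports, calls setState(), and restarts it only after the try/catch; since the catch block returns early, a failed setState() would leave the watchdog stopped everywhere. A scope guard makes the restart unconditional. This is a minimal sketch of that pattern, not part of the change itself; it assumes only the gPfcWdSwOrch global introduced above:

```cpp
// Minimal sketch: exception-safe pause/resume of the PFC watchdog around a
// mux switchover. Hypothetical; assumes only the gPfcWdSwOrch global above.
class PfcWdPauseGuard
{
public:
    PfcWdPauseGuard()  { gPfcWdSwOrch->stopWdOnAllPorts(); }
    ~PfcWdPauseGuard() { gPfcWdSwOrch->startWdOnAllPorts(); }  // runs on every exit path

    PfcWdPauseGuard(const PfcWdPauseGuard&) = delete;
    PfcWdPauseGuard& operator=(const PfcWdPauseGuard&) = delete;
};

// Usage inside MuxCableOrch::addOperation():
//     PfcWdPauseGuard guard;
//     mux_obj->setState(state);  // the watchdog restarts even if this throws
```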
diff --git a/orchagent/orchdaemon.cpp b/orchagent/orchdaemon.cpp
index 3ab58e4fa65..d702a33a73f 100644
--- a/orchagent/orchdaemon.cpp
+++ b/orchagent/orchdaemon.cpp
@@ -41,6 +41,7 @@ Directory gDirectory;
 NatOrch *gNatOrch;
 BfdOrch *gBfdOrch;
 QosOrch *gQosOrch;
+PfcWdSwOrch *gPfcWdSwOrch;
 
 bool gIsNatSupported = false;
 bool gSaiRedisLogRotate = false;
@@ -376,13 +377,16 @@ bool OrchDaemon::init()
 
         static const vector<sai_queue_attr_t> queueAttrIds;
 
-        m_orchList.push_back(new PfcWdSwOrch(
+        gPfcWdSwOrch = (PfcWdSwOrch *)
+            new PfcWdSwOrch(
                 m_configDb,
                 pfc_wd_tables,
                 portStatIds,
                 queueStatIds,
                 queueAttrIds,
-                PFC_WD_POLL_MSECS));
+                PFC_WD_POLL_MSECS);
+
+        m_orchList.push_back(gPfcWdSwOrch);
     }
     else if ((platform == INVM_PLATFORM_SUBSTRING)
             || (platform == CLX_PLATFORM_SUBSTRING)
@@ -421,23 +425,29 @@ bool OrchDaemon::init()
         if ((platform == INVM_PLATFORM_SUBSTRING) ||
             (platform == NPS_PLATFORM_SUBSTRING) ||
             (platform == CLX_PLATFORM_SUBSTRING))
         {
-            m_orchList.push_back(new PfcWdSwOrch(
+            gPfcWdSwOrch = (PfcWdSwOrch *)
+                new PfcWdSwOrch(
                     m_configDb,
                     pfc_wd_tables,
                     portStatIds,
                     queueStatIds,
                     queueAttrIds,
-                    PFC_WD_POLL_MSECS));
+                    PFC_WD_POLL_MSECS);
+
+            m_orchList.push_back(gPfcWdSwOrch);
         }
         else if (platform == BFN_PLATFORM_SUBSTRING)
         {
-            m_orchList.push_back(new PfcWdSwOrch(
+            gPfcWdSwOrch = (PfcWdSwOrch *)
+                new PfcWdSwOrch(
                     m_configDb,
                     pfc_wd_tables,
                     portStatIds,
                     queueStatIds,
                     queueAttrIds,
-                    PFC_WD_POLL_MSECS));
+                    PFC_WD_POLL_MSECS);
+
+            m_orchList.push_back(gPfcWdSwOrch);
         }
     }
     else if (platform == BRCM_PLATFORM_SUBSTRING)
@@ -473,13 +483,16 @@ bool OrchDaemon::init()
             SAI_QUEUE_ATTR_PAUSE_STATUS,
         };
-        m_orchList.push_back(new PfcWdSwOrch(
-            m_configDb,
-            pfc_wd_tables,
-            portStatIds,
-            queueStatIds,
-            queueAttrIds,
-            PFC_WD_POLL_MSECS));
+        gPfcWdSwOrch = (PfcWdSwOrch *)
+            new PfcWdSwOrch(
+                m_configDb,
+                pfc_wd_tables,
+                portStatIds,
+                queueStatIds,
+                queueAttrIds,
+                PFC_WD_POLL_MSECS);
+
+        m_orchList.push_back(gPfcWdSwOrch);
     }
     else if (platform == CISCO_8000_PLATFORM_SUBSTRING)
     {
         static const vector<sai_port_stat_t> portStatIds;
@@ -494,13 +507,16 @@ bool OrchDaemon::init()
             SAI_QUEUE_ATTR_PAUSE_STATUS,
         };
 
-        m_orchList.push_back(new PfcWdSwOrch(
+        gPfcWdSwOrch = (PfcWdSwOrch *)
+            new PfcWdSwOrch(
                 m_configDb,
                 pfc_wd_tables,
                 portStatIds,
                 queueStatIds,
                 queueAttrIds,
-                PFC_WD_POLL_MSECS));
+                PFC_WD_POLL_MSECS);
+
+        m_orchList.push_back(gPfcWdSwOrch);
     }
 
     m_orchList.push_back(&CounterCheckOrch::getInstance(m_configDb));
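PfcWdSwOrch is a class template parameterized on the platform-specific drop and detect handlers, so a single typed gPfcWdSwOrch global can name only one instantiation, and the (PfcWdSwOrch *) casts above have to bridge whichever instantiation a given platform constructs. A smaller-surface alternative (a sketch, not what this change does) is a non-template interface carrying just the two new entry points; the Mellanox handler pair is used below purely for illustration:

```cpp
// Sketch: a non-template interface for the pause/resume entry points, so the
// global does not depend on the handler template arguments. Hypothetical;
// each PfcWdSwOrch<DropHandler, DetectHandler> would also derive from it.
struct PfcWdPausable
{
    virtual ~PfcWdPausable() = default;
    virtual bool startWdOnAllPorts() = 0;
    virtual bool stopWdOnAllPorts() = 0;
};

// The global then needs no casts:
//     PfcWdPausable *gPfcWdSwOrch = nullptr;
//     auto *orch = new PfcWdSwOrch<PfcWdZeroBufferHandler, PfcWdLossyHandler>(/* ... */);
//     gPfcWdSwOrch = orch;
//     m_orchList.push_back(orch);
```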
diff --git a/orchagent/pfcwdorch.cpp b/orchagent/pfcwdorch.cpp
index 62765ab0a11..09327874ef4 100644
--- a/orchagent/pfcwdorch.cpp
+++ b/orchagent/pfcwdorch.cpp
@@ -1,6 +1,7 @@
 #include <limits.h>
 #include <unordered_map>
 #include <inttypes.h>
+#include <string>
 #include "pfcwdorch.h"
 #include "sai_serialize.h"
 #include "portsorch.h"
@@ -754,6 +755,151 @@ bool PfcWdSwOrch<DropHandler, DetectHandler>::stopWdOnPort(const Port& port)
 
     return true;
 }
 
+template <typename DropHandler, typename DetectHandler>
+bool PfcWdSwOrch<DropHandler, DetectHandler>::continueWdOnPort(const Port& port)
+{
+    SWSS_LOG_ENTER();
+
+    sai_object_id_t queueId = SAI_NULL_OBJECT_ID;
+
+    for (uint8_t i = 0; i < port.m_queue_ids.size(); i++)
+    {
+        queueId = port.m_queue_ids[i];
+
+        auto pos = m_pfcwdconfigs.find(queueId);
+        if (pos == m_pfcwdconfigs.end())
+        {
+            continue;
+        }
+
+        // createEntry() re-creates the watchdog for the whole port, so the
+        // configuration saved for the first watched queue is sufficient
+        SWSS_LOG_INFO("Starting Wd on port %s", port.m_alias.c_str());
+        return PfcWdOrch<DropHandler, DetectHandler>::createEntry(port.m_alias, pos->second) == task_success;
+    }
+
+    return true;
+}
+
+template <typename DropHandler, typename DetectHandler>
+bool PfcWdSwOrch<DropHandler, DetectHandler>::pauseWdOnPort(const Port& port)
+{
+    SWSS_LOG_ENTER();
+
+    sai_object_id_t queueId = SAI_NULL_OBJECT_ID;
+
+    for (uint8_t i = 0; i < port.m_queue_ids.size(); i++)
+    {
+        queueId = port.m_queue_ids[i];
+        if (m_entryMap.find(queueId) == m_entryMap.end())
+        {
+            continue;
+        }
+
+        // Save this queue's configuration so startWdOnAllPorts() can restore it.
+        // COUNTERS_DB keeps the times in microseconds while the PFC_WD config
+        // table uses milliseconds, hence the division by 1000.
+        const string countersKey = this->getCountersTable()->getTableName()
+            + this->getCountersTable()->getTableNameSeparator()
+            + sai_serialize_object_id(queueId);
+
+        uint32_t detection = stoi(*this->getCountersDb()->hget(countersKey, "PFC_WD_DETECTION_TIME"));
+        uint32_t restoration = stoi(*this->getCountersDb()->hget(countersKey, "PFC_WD_RESTORATION_TIME"));
+        string action = *this->getCountersDb()->hget(countersKey, "PFC_WD_ACTION");
+
+        vector<FieldValueTuple> configs;
+        configs.push_back(FieldValueTuple(PFC_WD_DETECTION_TIME, to_string(detection / 1000)));
+        configs.push_back(FieldValueTuple(PFC_WD_RESTORATION_TIME, to_string(restoration / 1000)));
+        configs.push_back(FieldValueTuple(PFC_WD_ACTION, action));
+
+        m_pfcwdconfigs.emplace(queueId, configs);
+
+        // Disable PFC watchdog on this port
+        PfcWdOrch<DropHandler, DetectHandler>::deleteEntry(port.m_alias);
+        return true;
+    }
+
+    return false;
+}
+
+template <typename DropHandler, typename DetectHandler>
+bool PfcWdSwOrch<DropHandler, DetectHandler>::startWdOnAllPorts()
+{
+    SWSS_LOG_ENTER();
+
+    auto allPorts = gPortsOrch->getAllPorts();
+
+    for (auto &it: allPorts)
+    {
+        Port port = it.second;
+
+        if (port.m_type != Port::Type::PHY)
+        {
+            continue;
+        }
+
+        continueWdOnPort(port);
+    }
+
+    m_pfcwdconfigs.clear();
+
+    return true;
+}
+
+template <typename DropHandler, typename DetectHandler>
+bool PfcWdSwOrch<DropHandler, DetectHandler>::stopWdOnAllPorts()
+{
+    SWSS_LOG_ENTER();
+
+    auto allPorts = gPortsOrch->getAllPorts();
+
+    for (auto &it: allPorts)
+    {
+        Port port = it.second;
+
+        if (port.m_type != Port::Type::PHY)
+        {
+            continue;
+        }
+
+        // Check whether the port is currently storming; if so, leave the
+        // watchdog running so the storm keeps being handled during the switchover
+        if (isStormingOnPort(port))
+        {
+            SWSS_LOG_INFO("Port %s is storming, skipping", port.m_alias.c_str());
+            continue;
+        }
+
+        pauseWdOnPort(port);
+    }
+
+    return true;
+}
+
+template <typename DropHandler, typename DetectHandler>
+bool PfcWdSwOrch<DropHandler, DetectHandler>::isStormingOnPort(const Port& port)
+{
+    SWSS_LOG_ENTER();
+
+    string status;
+    sai_object_id_t queueId = SAI_NULL_OBJECT_ID;
+
+    for (uint8_t i = 0; i < port.m_queue_ids.size(); i++)
+    {
+        queueId = port.m_queue_ids[i];
+        if (m_entryMap.find(queueId) == m_entryMap.end())
+        {
+            continue;
+        }
+
+        string countersKey = this->getCountersTable()->getTableName()
+            + this->getCountersTable()->getTableNameSeparator()
+            + sai_serialize_object_id(queueId);
+
+        status = *this->getCountersDb()->hget(countersKey, "PFC_WD_STATUS");
+
+        if (status == "stormed")
+        {
+            SWSS_LOG_NOTICE("Storm active on port %s.", port.m_alias.c_str());
+            return true;
+        }
+    }
+
+    return false;
+}
+
 template <typename DropHandler, typename DetectHandler>
 void PfcWdSwOrch<DropHandler, DetectHandler>::doTask(Consumer& consumer)
 {
diff --git a/orchagent/pfcwdorch.h b/orchagent/pfcwdorch.h
index 4013ab9ad56..84bd54ce4ca 100644
--- a/orchagent/pfcwdorch.h
+++ b/orchagent/pfcwdorch.h
@@ -77,6 +77,8 @@ class PfcWdSwOrch: public PfcWdOrch<DropHandler, DetectHandler>
     virtual bool startWdOnPort(const Port& port,
             uint32_t detectionTime, uint32_t restorationTime, PfcWdAction action);
     virtual bool stopWdOnPort(const Port& port);
+    virtual bool startWdOnAllPorts();
+    virtual bool stopWdOnAllPorts();
 
     task_process_status createEntry(const string& key, const vector<FieldValueTuple>& data) override;
     virtual void doTask(SelectableTimer &timer);
@@ -118,8 +120,14 @@ class PfcWdSwOrch: public PfcWdOrch<DropHandler, DetectHandler>
     void enableBigRedSwitchMode();
     void setBigRedSwitchMode(string value);
 
+    bool isStormingOnPort(const Port& port);
+    bool continueWdOnPort(const Port& port);
+    bool pauseWdOnPort(const Port& port);
+
     map<sai_object_id_t, PfcWdQueueEntry> m_entryMap;
     map<sai_object_id_t, PfcWdQueueEntry> m_brsEntryMap;
 
+    // Track the per-queue configs while the watchdog is disabled
+    map<sai_object_id_t, vector<FieldValueTuple>> m_pfcwdconfigs;
+
     const vector<sai_port_stat_t> c_portStatIds;
     const vector<sai_queue_stat_t> c_queueStatIds;
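The pause/resume pair above round-trips the watchdog settings through COUNTERS_DB: pauseWdOnPort() saves PFC_WD_DETECTION_TIME, PFC_WD_RESTORATION_TIME and PFC_WD_ACTION for the first watched queue and removes the port entry, and continueWdOnPort() replays the saved tuple through the base-class createEntry(). DBConnector::hget() returns a null shared_ptr when a field is absent, and the code dereferences the result directly; that is safe only because those fields are written for every queue that has an m_entryMap entry. A defensive read could look like the following sketch, where hgetOrDefault is a hypothetical helper, not part of swss-common:

```cpp
#include <memory>
#include <string>
#include "dbconnector.h"  // swss::DBConnector, from sonic-swss-common

// Hypothetical helper: read one hash field, falling back to a default instead
// of dereferencing a null shared_ptr when the field is absent.
static std::string hgetOrDefault(swss::DBConnector &db, const std::string &key,
                                 const std::string &field, const std::string &fallback)
{
    std::shared_ptr<std::string> value = db.hget(key, field);
    return value ? *value : fallback;
}
```

pauseWdOnPort() could then skip a queue whose timing fields are missing instead of crashing mid-switchover.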
"Stopped PFC Watchdog on port " + test_port, 1) + # Check that pfc_wd was started + self.check_syslog(dvs, marker, "Started PFC Watchdog on port " + test_port, 1) + + # Set a storm on port + self.set_storm_state_on_port_queue(counters_db, test_port, counter_queue, state="enabled") + # Wait for storm state to set + time.sleep(1) + + marker = dvs.add_log_marker() + + # Do a switchover + self.set_mux_state(appdb, test_port, "standby") + + # Check that pfcwd wasn't disabled on port + self.check_syslog(dvs, marker, "Storm active on port " + test_port, 1) + + # Disable extra logging + dvs.runcmd("swssloglevel -l NOTICE -c orchagent") + def check_interface_exists_in_asicdb(self, asicdb, sai_oid): asicdb.wait_for_entry(self.ASIC_RIF_TABLE, sai_oid) return True @@ -854,6 +894,78 @@ def execute_action(self, action, dvs, test_info): else: pytest.fail('Invalid test action {}'.format(action)) + def setup_pfcwd(self, dvs, config_db): + pfc_test_ports = ["Ethernet0"] + pfc_test_queue = 3 + # set cable lengths + for port in pfc_test_ports: + fvs = {port: "5m"} + config_db.update_entry("CABLE_LENGTH", "AZURE", fvs) + dvs.port_admin_set(port, "up") + + # enable pfcwd + self.set_flex_counter_status("PFCWD", "enable", config_db) + self.set_flex_counter_status("QUEUE", "enable", config_db) + + for port in pfc_test_ports: + self.set_ports_pfc(port, pfc_test_queue, config_db, "enable") + + self.start_pfcwd_on_ports(pfc_test_ports, config_db) + + def teardown_pfcwd(self, dvs, config_db): + pfc_test_ports = ["Ethernet0"] + + # disable pfcwd + self.set_flex_counter_status("PFCWD", "disable", config_db) + # disable queue + self.set_flex_counter_status("QUEUE", "disable", config_db) + + for port in pfc_test_ports: + # shutdown port + dvs.port_admin_set(port, "down") + + def set_flex_counter_status(self, key, state, config_db): + fvs = {'FLEX_COUNTER_STATUS': state} + config_db.update_entry("FLEX_COUNTER_TABLE", key, fvs) + time.sleep(1) + + def set_ports_pfc(self, port, queue, config_db, status='enable'): + keyname = 'pfcwd_sw_enable' + if 'enable' in status: + fvs = {keyname: str(queue), 'pfc_enable': str(queue)} + config_db.create_entry("PORT_QOS_MAP", port, fvs) + else: + config_db.delete_entry("PORT_QOS_MAP", port) + + def start_pfcwd_on_ports(self, test_ports, config_db, poll_interval="200", detection_time="200", restoration_time="200", action="drop"): + pfcwd_info = {"POLL_INTERVAL": poll_interval} + config_db.update_entry("PFC_WD", "GLOBAL", pfcwd_info) + + pfcwd_info = {"action": action, + "detection_time" : detection_time, + "restoration_time": restoration_time + } + + for port in test_ports: + config_db.update_entry("PFC_WD", port, pfcwd_info) + + def stop_pfcwd_on_ports(self, dvs, port): + config_db = dvs.get_config_db() + config_db.delete_entry("PFC_WD", port) + + def verify_pfcwd_state_on_port(self, counters_db, port, state="stormed"): + fvs = {"PFC_WD_STATUS": state} + queue_oids = counters_db.get_entry("COUNTERS_QUEUE_NAME_MAP", "") + for queue in range(0,self.PFC_WD_QUEUE_MAX): + queue_name = port + ":" + str(queue) + counters_db.wait_for_field_match("COUNTERS", queue_oids[queue_name], fvs) + + def set_storm_state_on_port_queue(self, counters_db, port, queue, state="enabled"): + fvs = {"DEBUG_STORM": state} + queue_oids = counters_db.get_entry("COUNTERS_QUEUE_NAME_MAP", "") + queue_name = port + ":" + str(queue) + counters_db.update_entry("COUNTERS", queue_oids[queue_name], fvs) + @pytest.fixture(scope='module') def setup_vlan(self, dvs): self.create_vlan_interface(dvs) @@ -987,6 +1099,13 @@ def 
@@ -987,6 +1099,13 @@ def intf_fdb_map(self, dvs, setup_vlan):
 
         return fdb_map
 
+    @pytest.fixture
+    def setup_teardown_pfcwd(self, dvs):
+        config_db = dvs.get_config_db()
+        self.setup_pfcwd(dvs, config_db)
+        yield
+        self.teardown_pfcwd(dvs, config_db)
+
 
 class TestMuxTunnel(TestMuxTunnelBase):
     """ Tests for Mux tunnel creation and removal """
@@ -1098,6 +1217,20 @@ def test_neighbor_miss_no_peer(
         for ip in test_ips:
             self.check_neighbor_state(dvs, dvs_route, ip, expect_route=False)
 
+    def test_mux_pfcwd_switching(
+        self, dvs, neighbor_cleanup, setup_vlan,
+        setup_mux_cable, setup_tunnel, setup_peer_switch,
+        setup_qos_map, setup_teardown_pfcwd, testlog
+    ):
+        """ Test that the PFC watchdog is paused, then resumed, across a mux switchover """
+        appdb = swsscommon.DBConnector(swsscommon.APPL_DB, dvs.redis_sock, 0)
+        counters_db = dvs.get_counters_db()
+        config_db = dvs.get_config_db()
+
+        self.create_and_test_pfcwd(dvs, appdb, counters_db, config_db)
+
 
 # Add Dummy always-pass test at end as workaroud
 # for issue when Flaky fail on final test it invokes module tear-down before retrying