Skip to content

Commit

Permalink
LAG keepalive script to reduce lacp session wait during warm-reboot (#…
Browse files Browse the repository at this point in the history
…2806)

A new mechanism is added here to reduce the LAG flap issue during hitless upgrades.

Problem being solved:
During warm upgrades T0 goes down, and with that the wait time for the LACP session starts.
If the wait time to refresh the LACP session is > 90s, then T1 initiates LAG teardown, and as a result dataplane impact is seen.
This script makes sure that LACPDUs are sent in the going down path continuously.

How time is saved w/ this mechanism:

The lacpsession wait period earlier used to start from when teamd container goes down.
New lacpsession wait period starts when kexec in current kernel is issued, and new kernel boots up.

Implementation:
When warm-reboot starts, capture LACPDUs sent from all LAG member ports.
For this allow 60s of prep + collection time.
Start sending LACPDUs w/ ~1s interval.
The last LACPDU is sent after all containers are down and kexec is issued.

Results:
Tested this on different platforms and images. Some results for time saved:

BRCM: 201811 -> 202012 --- 18s
BRCM: 202012 -> 202012 --- 20s
MLNX: 201911 -> 202205 --- 10s
MLNX: 202205 -> 202205 --- 10s
  • Loading branch information
vaibhavhd authored and StormLiangMS committed May 11, 2023
1 parent efdc533 commit 984983e
Show file tree
Hide file tree
Showing 3 changed files with 111 additions and 0 deletions.
8 changes: 8 additions & 0 deletions scripts/fast-reboot
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ STRICT=no
REBOOT_METHOD="/sbin/kexec -e"
ASSISTANT_IP_LIST=""
ASSISTANT_SCRIPT="/usr/local/bin/neighbor_advertiser"
LAG_KEEPALIVE_SCRIPT="/usr/local/bin/lag_keepalive.py"
WATCHDOG_UTIL="/usr/local/bin/watchdogutil"
DEVPATH="/usr/share/sonic/device"
PLATFORM=$(sonic-cfggen -H -v DEVICE_METADATA.localhost.platform)
Expand Down Expand Up @@ -681,6 +682,13 @@ fi
# disable trap-handlers which were set before
trap '' EXIT HUP INT QUIT TERM KILL ABRT ALRM
# start sending LACPDUs to keep the LAGs refreshed
# this is a non-blocking call, and the process will die in 300s
debug "Starting lag_keepalive to send LACPDUs ..."
timeout 300 python ${LAG_KEEPALIVE_SCRIPT} &
# give the lag_keepalive script a chance to get ready (30s) and collect one lacpdu before going down (30s)
sleep 60
if [ -x ${LOG_SSD_HEALTH} ]; then
debug "Collecting logs to check ssd health before ${REBOOT_TYPE}..."
${LOG_SSD_HEALTH}
Expand Down
102 changes: 102 additions & 0 deletions scripts/lag_keepalive.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#!/usr/bin/env python3

from scapy.config import conf
conf.ipv6_enabled = False
from scapy.all import sendp, sniff
from swsscommon.swsscommon import ConfigDBConnector
import time, threading, traceback
import syslog

SYSLOG_ID = 'lag_keepalive'


def log_info(msg):
    """Emit ``msg`` to syslog at INFO priority, tagged with ``SYSLOG_ID``.

    The syslog connection is opened and closed around this single message.
    """
    priority = syslog.LOG_INFO
    syslog.openlog(ident=SYSLOG_ID)
    syslog.syslog(priority, msg)
    syslog.closelog()


def log_error(msg):
    """Emit ``msg`` to syslog at ERR priority, tagged with ``SYSLOG_ID``.

    The syslog connection is opened and closed around this single message.
    """
    priority = syslog.LOG_ERR
    syslog.openlog(ident=SYSLOG_ID)
    syslog.syslog(priority, msg)
    syslog.closelog()


def sniff_lacpdu(device_mac, lag_member, lag_member_to_packet):
    """Capture one LACPDU sent by this device on ``lag_member``.

    Sniffs on the given interface with a capture filter matching ether proto
    0x8809 and ``device_mac`` as the ethernet source, stopping after the first
    matching packet or after 30 seconds, whichever comes first.

    This function is used as a thread target, so the result is written into
    the shared ``lag_member_to_packet`` dict instead of being returned.

    Args:
        device_mac: MAC address string used as the ``ether src`` filter value.
        lag_member: name of the interface to sniff on; also the dict key.
        lag_member_to_packet: dict updated in place with
            ``lag_member -> captured packet list`` on a successful capture.
    """
    sniffed_packet = sniff(iface=lag_member,
                           filter="ether proto 0x8809 and ether src {}".format(device_mac),
                           count=1, timeout=30)
    if not sniffed_packet:
        # The sniff timed out without seeing a LACPDU. Leave this member out
        # of the dict so that a missing capture is distinguishable from a
        # successful one; storing the empty result would make the member look
        # captured while there is actually nothing to replay.
        return
    lag_member_to_packet[lag_member] = sniffed_packet


def get_lacpdu_per_lag_member():
    """Capture one outgoing LACPDU for every member of an operationally-up LAG.

    Reads the LAG_MEMBER_TABLE keys from APPL_DB and the device MAC from
    CONFIG_DB. Each key is treated as a ``(lag name, member name)`` pair. For
    every member whose LAG has ``oper_status == "up"`` in LAG_TABLE, a thread
    running ``sniff_lacpdu`` is started so that all members are sniffed
    concurrently. Each thread is then joined with a 30 second timeout.

    Returns:
        A tuple ``(active_lag_members, lag_member_to_packet)`` where
        ``active_lag_members`` is the list of member names that were sniffed,
        and ``lag_member_to_packet`` is the dict filled in by the sniffer
        threads (member name -> captured packet list).
    """
    appDB = ConfigDBConnector()
    appDB.db_connect('APPL_DB')
    appDB_lag_info = appDB.get_keys('LAG_MEMBER_TABLE')
    configDB = ConfigDBConnector()
    configDB.db_connect('CONFIG_DB')
    device_mac = configDB.get(configDB.CONFIG_DB, "DEVICE_METADATA|localhost", "mac")
    active_lag_members = []
    lag_member_to_packet = {}
    sniffer_threads = []
    for lag_entry in appDB_lag_info:
        lag_name = str(lag_entry[0])
        oper_status = appDB.get(appDB.APPL_DB, "LAG_TABLE:{}".format(lag_name), "oper_status")
        if oper_status != "up":
            # only apply the workaround for active lags
            continue
        lag_member = str(lag_entry[1])
        active_lag_members.append(lag_member)
        # use threading to capture lacpdus from several lag members simultaneously
        sniffer_thread = threading.Thread(target=sniff_lacpdu,
                                          args=(device_mac, lag_member, lag_member_to_packet))
        sniffer_thread.start()
        sniffer_threads.append(sniffer_thread)

    # sniff for lacpdu should finish in <= 30s. sniff timeout is also set to 30s
    for sniffer in sniffer_threads:
        sniffer.join(timeout=30)

    return active_lag_members, lag_member_to_packet


def lag_keepalive(lag_member_to_packet):
    """Replay the captured LACPDUs on their interfaces forever, once per second.

    Each round sends every stored packet out of the interface it is keyed by,
    logs the round to syslog, and then sleeps for one second. A failure to
    send on one interface is logged and does not stop the remaining sends.
    This function never returns.

    Args:
        lag_member_to_packet: dict mapping interface name -> packet(s) to send.
    """
    while True:
        for port, lacpdu in lag_member_to_packet.items():
            try:
                sendp(lacpdu, iface=port, verbose=False)
            except Exception:
                # report the failure, then move on to the next interface
                failure_text = "Failed to send LACPDU packet from interface {} with error: {}".format(
                    port, traceback.format_exc())
                log_error(failure_text)
        round_summary = "sent LACPDU packets via {}".format(lag_member_to_packet.keys())
        log_info(round_summary)
        time.sleep(1)


def main():
    """Capture LACPDUs for the active LAG members, then keep replaying them.

    The capture step is retried until it completes without raising. If fewer
    packets were captured than there are active LAG members, an error is
    logged but the script carries on with whatever was captured. When at
    least one capture exists, control passes to ``lag_keepalive``, which
    loops forever; otherwise the function returns.
    """
    while True:
        try:
            active_lag_members, lag_member_to_packet = get_lacpdu_per_lag_member()
            if len(active_lag_members) != len(lag_member_to_packet.keys()):
                log_error("Failed to capture LACPDU packets for some lag members. " +
                          "Active lag members: {}. LACPDUs captured for: {}".format(
                              active_lag_members, lag_member_to_packet.keys()))

            log_info("ready to send LACPDU packets via {}".format(lag_member_to_packet.keys()))
        except Exception:
            traceback_msg = traceback.format_exc()
            log_error("Failed to get LAG members and LACPDUs with error: {}".format(
                traceback_msg))
            # keep attempting until sniffed packets are ready, but pause
            # briefly first: retrying immediately on a persistent failure
            # would spin the CPU and flood syslog with tracebacks
            time.sleep(1)
            continue
        # if no exceptions are thrown, break from loop as LACPDUs are ready to be sent
        break

    if lag_member_to_packet:
        # start an infinite loop to keep sending lacpdus from lag member ports
        lag_keepalive(lag_member_to_packet)


if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@
'scripts/intfutil',
'scripts/intfstat',
'scripts/ipintutil',
'scripts/lag_keepalive.py',
'scripts/lldpshow',
'scripts/log_ssd_health',
'scripts/mellanox_buffer_migrator.py',
Expand Down

0 comments on commit 984983e

Please sign in to comment.