From eb7945fade50d53f94f266b2f27b7181bf3e665d Mon Sep 17 00:00:00 2001 From: Vaibhav Hemant Dixit Date: Wed, 24 Mar 2021 09:43:02 -0700 Subject: [PATCH] Warmboot script improvements - timeout exec, disable swss autorestart, remove trap (#1495) The following changes are made to the warmboot/fastboot script: 1. Add a 5s timeout to make sure the syncd shutdown request returns in time. 2. Disable the trap handlers after `set +e`. 3. Make sure that the syncd pre-shutdown wait won't take more than 60 seconds. 4. Make sure subsequent docker exec calls won't get stuck for a long time. 5. Before shutdown, check that docker exec on the relevant docker containers still works. --- scripts/fast-reboot | 52 +++++++++++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 16 deletions(-) diff --git a/scripts/fast-reboot b/scripts/fast-reboot index 92648bd2072a..c782265e6b71 100755 --- a/scripts/fast-reboot +++ b/scripts/fast-reboot @@ -47,7 +47,7 @@ function error() function debug() { if [[ x"${VERBOSE}" == x"yes" ]]; then - echo `date` $@ + echo $(date) $@ fi logger "$@" } @@ -128,10 +128,10 @@ function clear_warm_boot() { common_clear - result=`timeout 10s config warm_restart disable; if [[ $? == 124 ]]; then echo timeout; else echo "code ($?)"; fi` || /bin/true + result=$(timeout 10s config warm_restart disable; res=$?; if [[ $res == 124 ]]; then echo timeout; else echo "code ($res)"; fi) || /bin/true debug "Cancel warm-reboot: ${result}" - TIMESTAMP=`date +%Y%m%d-%H%M%S` + TIMESTAMP=$(date +%Y%m%d-%H%M%S) if [[ -f ${WARM_DIR}/${REDIS_FILE} ]]; then mv -f ${WARM_DIR}/${REDIS_FILE} ${WARM_DIR}/${REDIS_FILE}.${TIMESTAMP} || /bin/true fi @@ -155,7 +155,7 @@ function initialize_pre_shutdown() { debug "Initialize pre-shutdown ..." 
TABLE="WARM_RESTART_TABLE|warm-shutdown" - RESTORE_COUNT=`sonic-db-cli STATE_DB hget "${TABLE}" restore_count` + RESTORE_COUNT=$(sonic-db-cli STATE_DB hget "${TABLE}" restore_count) if [[ -z "$RESTORE_COUNT" ]]; then sonic-db-cli STATE_DB hset "${TABLE}" "restore_count" "0" > /dev/null fi @@ -165,9 +165,10 @@ function initialize_pre_shutdown() function request_pre_shutdown() { debug "Requesting pre-shutdown ..." - /usr/bin/docker exec -i syncd /usr/bin/syncd_request_shutdown --pre &> /dev/null || { + STATE=$(timeout 5s docker exec syncd /usr/bin/syncd_request_shutdown --pre &> /dev/null; if [[ $? == 124 ]]; then echo "timed out"; fi) + if [[ x"${STATE}" == x"timed out" ]]; then error "Failed to request pre-shutdown" - } + fi } function recover_issu_bank_file() @@ -205,18 +206,18 @@ function wait_for_pre_shutdown_complete_or_fail() STATE="requesting" declare -i waitcount declare -i retrycount - waitcount=0 retrycount=0 + start_time=$SECONDS + elapsed_time=$(($SECONDS - $start_time)) # Wait up to 60 seconds for pre-shutdown to complete - while [[ ${waitcount} -lt 600 ]]; do + while [[ ${elapsed_time} -lt 60 ]]; do # timeout doesn't work with -i option of "docker exec". Therefore we have # to invoke docker exec directly below. - STATE=`timeout 5s sonic-db-cli STATE_DB hget "${TABLE}" state; if [[ $? == 124 ]]; then echo "timed out"; fi` + STATE=$(timeout 5s sonic-db-cli STATE_DB hget "${TABLE}" state; if [[ $? == 124 ]]; then echo "timed out"; fi) if [[ x"${STATE}" == x"timed out" ]]; then - waitcount+=50 retrycount+=1 - debug "Timed out getting pre-shutdown state (${waitcount}) retry count ${retrycount} ..." + debug "Timed out getting pre-shutdown state, retry count ${retrycount} ..." 
if [[ retrycount -gt 2 ]]; then break fi @@ -224,14 +225,14 @@ function wait_for_pre_shutdown_complete_or_fail() break else sleep 0.1 - waitcount+=1 fi + elapsed_time=$(($SECONDS - $start_time)) done if [[ x"${STATE}" != x"pre-shutdown-succeeded" ]]; then - debug "Syncd pre-shutdown failed: ${STATE} ..." + debug "Syncd pre-shutdown failed, state: ${STATE} ..." else - debug "Pre-shutdown succeeded ..." + debug "Pre-shutdown succeeded, state: ${STATE} ..." fi } @@ -259,7 +260,10 @@ function backup_database() # Dump redis content to a file 'dump.rdb' in warmboot directory docker cp database:/var/lib/$target_db_inst/$REDIS_FILE $WARM_DIR - docker exec -i database rm /var/lib/$target_db_inst/$REDIS_FILE + STATE=$(timeout 5s docker exec database rm /var/lib/$target_db_inst/$REDIS_FILE; if [[ $? == 124 ]]; then echo "timed out"; fi) + if [[ x"${STATE}" == x"timed out" ]]; then + error "Timed out during attempting to remove Redis dump file from database container" + fi } function setup_control_plane_assistant() @@ -309,10 +313,23 @@ function setup_reboot_variables() INITRD=$(echo $KERNEL_IMAGE | sed 's/vmlinuz/initrd.img/g') } +function check_docker_exec() +{ + containers="radv bgp lldp swss database teamd syncd" + for container in $containers; do + STATE=$(timeout 1s docker exec $container echo "success"; if [[ $? == 124 ]]; then echo "timed out"; fi) + if [[ x"${STATE}" == x"timed out" ]]; then + error "Docker exec on $container timedout" + exit "${EXIT_FAILURE}" + fi + done +} + function reboot_pre_check() { + check_docker_exec # Make sure that the file system is normal: read-write able - filename="/host/test-`date +%Y%m%d-%H%M%S`" + filename="/host/test-$(date +%Y%m%d-%H%M%S)" if [[ ! -f ${filename} ]]; then touch ${filename} fi @@ -541,6 +558,9 @@ fi # service will go down and we cannot recover from it. 
set +e +# disable trap-handlers which were set before +trap '' EXIT HUP INT QUIT TERM KILL ABRT ALRM + if [ -x ${LOG_SSD_HEALTH} ]; then debug "Collecting logs to check ssd health before ${REBOOT_TYPE}..." ${LOG_SSD_HEALTH}