From f2652874e6a3354c98ae85ee154e2df3f1c8dcbd Mon Sep 17 00:00:00 2001 From: Spoorthi Satheesha <9302666+spoo-bar@users.noreply.github.com> Date: Thu, 23 Sep 2021 21:01:10 +0530 Subject: [PATCH] feat: adding `max-retries` limit to preupgrade retry limit (#10137) ## Description Ref: #9973, #10056 * Skipping backup when skipping upgrade at specified height * adding env var `PREUPGRADE_MAX_RETRIES` which specifies the limit on how many times to retry preupgrade on error code `31` --- ### Author Checklist *All items are required. Please add a note to the item if the item is not applicable and please add links to any relevant follow up issues.* I have... - [ ] included the correct [type prefix](https://github.com/commitizen/conventional-commit-types/blob/v3.0.0/index.json) in the PR title - [ ] added `!` to the type prefix if API or client breaking change - [ ] targeted the correct branch (see [PR Targeting](https://github.com/cosmos/cosmos-sdk/blob/master/CONTRIBUTING.md#pr-targeting)) - [ ] provided a link to the relevant issue or specification - [ ] followed the guidelines for [building modules](https://github.com/cosmos/cosmos-sdk/blob/master/docs/building-modules) - [ ] included the necessary unit and integration [tests](https://github.com/cosmos/cosmos-sdk/blob/master/CONTRIBUTING.md#testing) - [ ] added a changelog entry to `CHANGELOG.md` - [ ] included comments for [documenting Go code](https://blog.golang.org/godoc) - [ ] updated the relevant documentation or specification - [ ] reviewed "Files changed" and left comments if necessary - [ ] confirmed all CI checks have passed ### Reviewers Checklist *All items are required. Please add a note if the item is not applicable and please add your handle next to the items reviewed if you only reviewed selected items.* I have... - [ ] confirmed the correct [type prefix](https://github.com/commitizen/conventional-commit-types/blob/v3.0.0/index.json) in the PR title - [ ] confirmed `!` in the type prefix if API or client breaking change - [ ] confirmed all author checklist items have been addressed - [ ] reviewed state machine logic - [ ] reviewed API design and naming - [ ] reviewed documentation is accurate - [ ] reviewed tests and test coverage - [ ] manually tested (if applicable) --- cosmovisor/CHANGELOG.md | 3 +- cosmovisor/README.md | 2 ++ cosmovisor/args.go | 20 +++++++---- cosmovisor/process.go | 70 ++++++++++++++++++++++++-------------- cosmovisor/process_test.go | 2 +- 5 files changed, 64 insertions(+), 33 deletions(-) diff --git a/cosmovisor/CHANGELOG.md b/cosmovisor/CHANGELOG.md index 51cce4b77a29..6b5e609ea4a0 100644 --- a/cosmovisor/CHANGELOG.md +++ b/cosmovisor/CHANGELOG.md @@ -41,7 +41,8 @@ Ref: https://keepachangelog.com/en/1.0.0/ + [\#8590](https://github.com/cosmos/cosmos-sdk/pull/8590) File watcher for cosmovisor. Instead of parsing logs from stdin and stderr, we watch the `/data/upgrade-info.json` file updates using polling mechanism. + [\#10128](https://github.com/cosmos/cosmos-sdk/pull/10128) Change default value of `DAEMON_RESTART_AFTER_UPGRADE` to `true`. -+ [\#9999](https://github.com/cosmos/cosmos-sdk/issues/9999) Added `version` command that returns the cosmovisor version and the application version. ++ [\#9999](https://github.com/cosmos/cosmos-sdk/pull/10103) Added `version` command that returns the cosmovisor version and the application version. ++ [\#9973](https://github.com/cosmos/cosmos-sdk/pull/10056) Added support for pre-upgrade command in Cosmovisor to be called before the binary is upgraded. Added new environmental variable `DAEMON_PREUPGRADE_MAX_RETRIES` that holds the maximum number of times to reattempt pre-upgrade before failing. ### Improvements diff --git a/cosmovisor/README.md b/cosmovisor/README.md index ce6193343bd1..1d3eb2b00f7e 100644 --- a/cosmovisor/README.md +++ b/cosmovisor/README.md @@ -43,6 +43,8 @@ All arguments passed to `cosmovisor` will be passed to the application binary (a * `DAEMON_RESTART_AFTER_UPGRADE` (*optional*, default = `true`), if `true`, restarts the subprocess with the same command-line arguments and flags (but with the new binary) after a successful upgrade. Otherwise (`false`), `cosmovisor` stops running after an upgrade and requires the system administrator to manually restart it. Note restart is only after the upgrade and does not auto-restart the subprocess after an error occurs. * `DAEMON_POLL_INTERVAL` is the interval length in milliseconds for polling the upgrade plan file. Default: 300. * `UNSAFE_SKIP_BACKUP` (defaults to `false`), if set to `false`, backs up the data before trying the upgrade. Otherwise (`true`), upgrades directly without performing a backup. The default value of false is useful and recommended in case of failures and when a backup needed to rollback. We recommend using the default backup option `UNSAFE_SKIP_BACKUP=false`. +* `DAEMON_PREUPGRADE_MAX_RETRIES` (defaults to `0`). The maximum number of times to call `pre-upgrade` in the application after exit status of `31`. After the maximum number of retries, cosmovisor fails the upgrade. + ### Folder Layout diff --git a/cosmovisor/args.go b/cosmovisor/args.go index 192b60ce53a5..6e96e5ab2387 100644 --- a/cosmovisor/args.go +++ b/cosmovisor/args.go @@ -15,12 +15,13 @@ import ( // environment variable names const ( - envHome = "DAEMON_HOME" - envName = "DAEMON_NAME" - envDownloadBin = "DAEMON_ALLOW_DOWNLOAD_BINARIES" - envRestartUpgrade = "DAEMON_RESTART_AFTER_UPGRADE" - envSkipBackup = "UNSAFE_SKIP_BACKUP" - envInterval = "DAEMON_POLL_INTERVAL" + envHome = "DAEMON_HOME" + envName = "DAEMON_NAME" + envDownloadBin = "DAEMON_ALLOW_DOWNLOAD_BINARIES" + envRestartUpgrade = "DAEMON_RESTART_AFTER_UPGRADE" + envSkipBackup = "UNSAFE_SKIP_BACKUP" + envInterval = "DAEMON_POLL_INTERVAL" + envPreupgradeMaxRetries = "DAEMON_PREUPGRADE_MAX_RETRIES" ) const ( @@ -42,6 +43,7 @@ type Config struct { RestartAfterUpgrade bool PollInterval time.Duration UnsafeSkipBackup bool + PreupgradeMaxRetries int // currently running upgrade currentUpgrade UpgradeInfo @@ -146,6 +148,12 @@ func GetConfigFromEnv() (*Config, error) { if err := cfg.validate(); err != nil { return nil, err } + + envPreupgradeMaxRetriesVal := os.Getenv(envPreupgradeMaxRetries) + if cfg.PreupgradeMaxRetries, err = strconv.Atoi(envPreupgradeMaxRetriesVal); err != nil && envPreupgradeMaxRetriesVal != "" { + return nil, fmt.Errorf("%s could not be parsed to int: %w", envPreupgradeMaxRetries, err) + } + return cfg, nil } diff --git a/cosmovisor/process.go b/cosmovisor/process.go index bc7b2e61337b..e68255c038a3 100644 --- a/cosmovisor/process.go +++ b/cosmovisor/process.go @@ -61,13 +61,13 @@ func (l Launcher) Run(args []string, stdout, stderr io.Writer) (bool, error) { if err != nil || !needsUpdate { return false, err } - if err := doBackup(l.cfg); err != nil { - return false, err - } - if !SkipUpgrade(args, l.fw.currentInfo) { - err = doPreUpgrade(l.cfg) - if err != nil { + if !IsSkipUpgradeHeight(args, l.fw.currentInfo) { + if err := doBackup(l.cfg); err != nil { + return false, err + } + + if err = doPreUpgrade(l.cfg); err != nil { return false, err } } @@ -152,32 +152,52 @@ func doBackup(cfg *Config) error { return nil } -// doPreUpgrade runs the pre-upgrade command defined by the application +// doPreUpgrade runs the pre-upgrade command defined by the application and handles respective error codes +// cfg contains the cosmovisor config from env var func doPreUpgrade(cfg *Config) error { - bin, err := cfg.CurrentBin() - preUpgradeCmd := exec.Command(bin, "pre-upgrade") + counter := 0 + for { + if counter > cfg.PreupgradeMaxRetries { + return fmt.Errorf("pre-upgrade command failed. reached max attempt of retries - %d", cfg.PreupgradeMaxRetries) + } - _, err = preUpgradeCmd.Output() + err := executePreUpgradeCmd(cfg) + counter += 1 - if err != nil { - if err.(*exec.ExitError).ProcessState.ExitCode() == 1 { - fmt.Println("pre-upgrade command does not exist. continuing the upgrade.") - return nil - } - if err.(*exec.ExitError).ProcessState.ExitCode() == 30 { - return fmt.Errorf("pre-upgrade command failed : %w", err) - } - if err.(*exec.ExitError).ProcessState.ExitCode() == 31 { - fmt.Println("pre-upgrade command failed. retrying.") - return doPreUpgrade(cfg) + if err != nil { + if err.(*exec.ExitError).ProcessState.ExitCode() == 1 { + fmt.Println("pre-upgrade command does not exist. continuing the upgrade.") + return nil + } + if err.(*exec.ExitError).ProcessState.ExitCode() == 30 { + return fmt.Errorf("pre-upgrade command failed : %w", err) + } + if err.(*exec.ExitError).ProcessState.ExitCode() == 31 { + fmt.Println("pre-upgrade command failed. retrying. attempt:", counter) + fmt.Println(err) + continue + } } + fmt.Println("pre-upgrade successful. continuing the upgrade.") + return nil } - fmt.Println("pre-upgrade successful. continuing the upgrade.") - return nil } -// skipUpgrade checks if pre-upgrade script must be run. If the height in the upgrade plan matches any of the heights provided in --safe-skip-upgrade, the script is not run -func SkipUpgrade(args []string, upgradeInfo UpgradeInfo) bool { +// executePreUpgradeCmd runs the pre-upgrade command defined by the application +// cfg contains the cosmosvisor config from the env vars +func executePreUpgradeCmd(cfg *Config) error { + bin, err := cfg.CurrentBin() + if err != nil { + return err + } + + preUpgradeCmd := exec.Command(bin, "pre-upgrade") + _, err = preUpgradeCmd.Output() + return err +} + +// IsSkipUpgradeHeight checks if pre-upgrade script must be run. If the height in the upgrade plan matches any of the heights provided in --safe-skip-upgrade, the script is not run +func IsSkipUpgradeHeight(args []string, upgradeInfo UpgradeInfo) bool { skipUpgradeHeights := UpgradeSkipHeights(args) for _, h := range skipUpgradeHeights { if h == int(upgradeInfo.Height) { diff --git a/cosmovisor/process_test.go b/cosmovisor/process_test.go index d2e9794fa254..f86ea1eb0b60 100644 --- a/cosmovisor/process_test.go +++ b/cosmovisor/process_test.go @@ -160,7 +160,7 @@ func TestSkipUpgrade(t *testing.T) { for i := range cases { tc := cases[i] require := require.New(t) - h := cosmovisor.SkipUpgrade(tc.args, tc.upgradeInfo) + h := cosmovisor.IsSkipUpgradeHeight(tc.args, tc.upgradeInfo) require.Equal(h, tc.expectRes) } }