From f9810cc48f7379a0c02c76d8fc897d3ffc9d6ad8 Mon Sep 17 00:00:00 2001 From: just-mitch <68168980+just-mitch@users.noreply.github.com> Date: Tue, 10 Dec 2024 19:12:00 -0500 Subject: [PATCH] feat: persistence in helm chart for validator and boot node (#10543) chore: give validators/boot-nodes 100Gi in network configs feat: allow metrics to be instantly flushed chore: flush archiver metrics on startup feat: allow making range queries to prometheus in tests --- .../aztec-network/templates/boot-node.yaml | 13 ++++++++ .../aztec-network/templates/validator.yaml | 14 ++++++++- spartan/aztec-network/values.yaml | 6 +++- .../values/4-validators-with-metrics.yaml | 2 ++ spartan/aztec-network/values/exp-1.yaml | 2 ++ spartan/aztec-network/values/rc-1.yaml | 2 ++ spartan/aztec-network/values/rc-2.yaml | 2 ++ .../archiver/src/archiver/archiver.ts | 2 +- .../archiver/src/archiver/instrumentation.ts | 19 +++++++++++- .../end-to-end/scripts/network_test.sh | 2 +- .../src/quality_of_service/alert_checker.ts | 30 ++++++++++++++++--- .../src/spartan/gating-passive.test.ts | 22 +++++++++----- yarn-project/end-to-end/src/spartan/utils.ts | 2 +- yarn-project/telemetry-client/src/metrics.ts | 1 + yarn-project/telemetry-client/src/noop.ts | 4 +++ yarn-project/telemetry-client/src/otel.ts | 8 +++++ .../telemetry-client/src/telemetry.ts | 7 ++++- 17 files changed, 120 insertions(+), 18 deletions(-) diff --git a/spartan/aztec-network/templates/boot-node.yaml b/spartan/aztec-network/templates/boot-node.yaml index 3a5d2103f9a..93a146a8c2e 100644 --- a/spartan/aztec-network/templates/boot-node.yaml +++ b/spartan/aztec-network/templates/boot-node.yaml @@ -11,6 +11,14 @@ spec: matchLabels: {{- include "aztec-network.selectorLabels" . | nindent 6 }} app: boot-node + volumeClaimTemplates: + - metadata: + name: boot-node-data + spec: + accessModes: [ "ReadWriteOnce" ] + resources: + requests: + storage: {{ .Values.bootNode.storageSize }} template: metadata: labels: @@ -119,6 +127,8 @@ spec: mountPath: /shared/p2p - name: config mountPath: /shared/config + - name: boot-node-data + mountPath: {{ .Values.bootNode.dataDir }} {{- if .Values.bootNode.deployContracts }} - name: scripts-output mountPath: /shared/contracts @@ -182,6 +192,9 @@ spec: emptyDir: {} - name: config emptyDir: {} + - name: boot-node-data + persistentVolumeClaim: + claimName: boot-node-data {{- if .Values.bootNode.deployContracts }} - name: scripts configMap: diff --git a/spartan/aztec-network/templates/validator.yaml b/spartan/aztec-network/templates/validator.yaml index 1faa6823076..b48acccee4d 100644 --- a/spartan/aztec-network/templates/validator.yaml +++ b/spartan/aztec-network/templates/validator.yaml @@ -12,6 +12,14 @@ spec: matchLabels: {{- include "aztec-network.selectorLabels" . | nindent 6 }} app: validator + volumeClaimTemplates: + - metadata: + name: validator-data + spec: + accessModes: [ "ReadWriteOnce" ] + resources: + requests: + storage: {{ .Values.validator.storageSize }} template: metadata: labels: @@ -53,7 +61,6 @@ spec: {{- end }} if [ "{{ .Values.validator.dynamicBootNode }}" = "true" ]; then - # Get the list of pod IPs for the validator service echo "{{ include "aztec-network.pxeUrl" . }}" > /shared/pxe/pxe_url else until curl --silent --head --fail "${BOOT_NODE_HOST}/status" > /dev/null; do @@ -136,6 +143,8 @@ spec: mountPath: /shared/p2p - name: config mountPath: /shared/config + - name: validator-data + mountPath: {{ .Values.validator.dataDir }} env: - name: POD_IP valueFrom: @@ -197,6 +206,9 @@ spec: emptyDir: {} - name: config emptyDir: {} + - name: validator-data + persistentVolumeClaim: + claimName: validator-data --- # If this is not a public network, create a headless service for StatefulSet DNS entries {{ if not .Values.network.public }} diff --git a/spartan/aztec-network/values.yaml b/spartan/aztec-network/values.yaml index a9596657a6f..807aa7fb8b6 100644 --- a/spartan/aztec-network/values.yaml +++ b/spartan/aztec-network/values.yaml @@ -71,7 +71,9 @@ bootNode: outboxAddress: "" feeJuiceAddress: "" feeJuicePortalAddress: "" - storage: "8Gi" + stakingAssetAddress: "" + storageSize: "1Gi" + dataDir: "/data" validator: # If true, the validator will use its peers to serve as the boot node. @@ -108,6 +110,8 @@ validator: requests: memory: "2Gi" cpu: "200m" + storageSize: "1Gi" + dataDir: "/data" proverNode: externalHost: "" diff --git a/spartan/aztec-network/values/4-validators-with-metrics.yaml b/spartan/aztec-network/values/4-validators-with-metrics.yaml index 6f59aa62708..74a7f6df2a1 100644 --- a/spartan/aztec-network/values/4-validators-with-metrics.yaml +++ b/spartan/aztec-network/values/4-validators-with-metrics.yaml @@ -21,6 +21,8 @@ validator: - 0x90F79bf6EB2c4f870365E785982E1f101E93b906 validator: disabled: false + sequencer: + enforceTimeTable: false bootNode: validator: diff --git a/spartan/aztec-network/values/exp-1.yaml b/spartan/aztec-network/values/exp-1.yaml index f0ff0edcc53..3e32951395b 100644 --- a/spartan/aztec-network/values/exp-1.yaml +++ b/spartan/aztec-network/values/exp-1.yaml @@ -14,6 +14,7 @@ images: pullPolicy: Always validator: + storageSize: "100Gi" replicas: 48 validatorKeys: - 0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80 @@ -124,6 +125,7 @@ validator: bootNode: peerIdPrivateKey: 080212200ba8451c6d62b03c4441f0a466c0bce7a3a595f2cf50a055ded3305c77aa3af0 + storageSize: "100Gi" validator: disabled: true diff --git a/spartan/aztec-network/values/rc-1.yaml b/spartan/aztec-network/values/rc-1.yaml index 625c58fd2aa..2b8deabe35b 100644 --- a/spartan/aztec-network/values/rc-1.yaml +++ b/spartan/aztec-network/values/rc-1.yaml @@ -15,6 +15,7 @@ telemetry: otelCollectorEndpoint: http://35.197.100.168:4318 validator: + storageSize: "100Gi" replicas: 48 validatorKeys: - 0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80 @@ -125,6 +126,7 @@ bootNode: peerIdPrivateKey: 080212200ba8451c6d62b03c4441f0a466c0bce7a3a595f2cf50a055ded3305c77aa3af0 validator: disabled: true + storageSize: "100Gi" proverAgent: replicas: 8 diff --git a/spartan/aztec-network/values/rc-2.yaml b/spartan/aztec-network/values/rc-2.yaml index e059fb4ce9a..7479723dd64 100644 --- a/spartan/aztec-network/values/rc-2.yaml +++ b/spartan/aztec-network/values/rc-2.yaml @@ -16,6 +16,7 @@ telemetry: validator: replicas: 48 + storageSize: "100Gi" validatorKeys: - 0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80 - 0x59c6995e998f97a5a0044966f0945389dc9e86dae88c7a8412f4603b6b78690d @@ -122,6 +123,7 @@ validator: disabled: false bootNode: + storageSize: "100Gi" peerIdPrivateKey: 080212200ba8451c6d62b03c4441f0a466c0bce7a3a595f2cf50a055ded3305c77aa3af0 validator: disabled: true diff --git a/yarn-project/archiver/src/archiver/archiver.ts b/yarn-project/archiver/src/archiver/archiver.ts index b897f117bc8..8a8ca018575 100644 --- a/yarn-project/archiver/src/archiver/archiver.ts +++ b/yarn-project/archiver/src/archiver/archiver.ts @@ -174,7 +174,7 @@ export class Archiver implements ArchiveSource { pollingIntervalMs: config.archiverPollingIntervalMS ?? 10_000, batchSize: config.archiverBatchSize ?? 100, }, - new ArchiverInstrumentation(telemetry, () => archiverStore.estimateSize()), + await ArchiverInstrumentation.new(telemetry, () => archiverStore.estimateSize()), { l1StartBlock, l1GenesisTime, epochDuration, slotDuration, ethereumSlotDuration }, ); await archiver.start(blockUntilSynced); diff --git a/yarn-project/archiver/src/archiver/instrumentation.ts b/yarn-project/archiver/src/archiver/instrumentation.ts index 11cc378fdcf..1f86770606b 100644 --- a/yarn-project/archiver/src/archiver/instrumentation.ts +++ b/yarn-project/archiver/src/archiver/instrumentation.ts @@ -18,13 +18,14 @@ export class ArchiverInstrumentation { private blockHeight: Gauge; private blockSize: Gauge; private syncDuration: Histogram; + private l1BlocksSynced: UpDownCounter; private proofsSubmittedDelay: Histogram; private proofsSubmittedCount: UpDownCounter; private dbMetrics: LmdbMetrics; private log = createLogger('archiver:instrumentation'); - constructor(private telemetry: TelemetryClient, lmdbStats?: LmdbStatsCallback) { + private constructor(private telemetry: TelemetryClient, lmdbStats?: LmdbStatsCallback) { const meter = telemetry.getMeter('Archiver'); this.blockHeight = meter.createGauge(Metrics.ARCHIVER_BLOCK_HEIGHT, { description: 'The height of the latest block processed by the archiver', @@ -59,6 +60,11 @@ export class ArchiverInstrumentation { }, }); + this.l1BlocksSynced = meter.createUpDownCounter(Metrics.ARCHIVER_L1_BLOCKS_SYNCED, { + description: 'Number of blocks synced from L1', + valueType: ValueType.INT, + }); + this.dbMetrics = new LmdbMetrics( meter, { @@ -77,6 +83,16 @@ export class ArchiverInstrumentation { ); } + public static async new(telemetry: TelemetryClient, lmdbStats?: LmdbStatsCallback) { + const instance = new ArchiverInstrumentation(telemetry, lmdbStats); + + instance.l1BlocksSynced.add(0); + + await instance.telemetry.flush(); + + return instance; + } + public isEnabled(): boolean { return this.telemetry.isEnabled(); } @@ -84,6 +100,7 @@ export class ArchiverInstrumentation { public processNewBlocks(syncTimePerBlock: number, blocks: L2Block[]) { this.syncDuration.record(Math.ceil(syncTimePerBlock)); this.blockHeight.record(Math.max(...blocks.map(b => b.number))); + this.l1BlocksSynced.add(blocks.length); for (const block of blocks) { this.blockSize.record(block.body.txEffects.length); } diff --git a/yarn-project/end-to-end/scripts/network_test.sh b/yarn-project/end-to-end/scripts/network_test.sh index b9083df3d7e..7c2089bb494 100755 --- a/yarn-project/end-to-end/scripts/network_test.sh +++ b/yarn-project/end-to-end/scripts/network_test.sh @@ -180,5 +180,5 @@ docker run --rm --network=host \ -e GRAFANA_PASSWORD=$GRAFANA_PASSWORD \ -e DEBUG=${DEBUG:-""} \ -e LOG_JSON=1 \ - -e LOG_LEVEL=verbose \ + -e LOG_LEVEL=${LOG_LEVEL:-"verbose"} \ aztecprotocol/end-to-end:$AZTEC_DOCKER_TAG $TEST diff --git a/yarn-project/end-to-end/src/quality_of_service/alert_checker.ts b/yarn-project/end-to-end/src/quality_of_service/alert_checker.ts index d6006cc082d..2a6ced89ac6 100644 --- a/yarn-project/end-to-end/src/quality_of_service/alert_checker.ts +++ b/yarn-project/end-to-end/src/quality_of_service/alert_checker.ts @@ -6,6 +6,9 @@ import * as yaml from 'js-yaml'; export interface AlertConfig { alert: string; expr: string; + start?: number; + end?: number; + step?: number; for: string; labels: Record; annotations: Record; @@ -18,7 +21,7 @@ export interface AlertCheckerConfig { // This config is good if you're running the otel-lgtm stack locally const DEFAULT_CONFIG: AlertCheckerConfig = { - grafanaEndpoint: 'http://localhost:3000/api/datasources/proxy/uid/prometheus/api/v1/query', + grafanaEndpoint: 'http://localhost:3000/api/datasources/proxy/uid/prometheus/api/v1', grafanaCredentials: 'admin:admin', }; @@ -41,10 +44,29 @@ export class AlertChecker { return data.alerts; } - private async queryGrafana(expr: string): Promise { + private async queryGrafana({ expr, start, end, step }: AlertConfig): Promise { const credentials = Buffer.from(this.config.grafanaCredentials).toString('base64'); - const response = await fetch(`${this.config.grafanaEndpoint}?query=${encodeURIComponent(expr)}`, { + let query = `query=${encodeURIComponent(expr)}`; + let action = 'query'; + + if (start) { + action = 'query_range'; + query += `&start=${start}`; + } + + if (end) { + query += `&end=${end}`; + } + + if (step) { + query += `&step=${step}`; + } + + const urlString = `${this.config.grafanaEndpoint}/${action}?${query}`; + this.logger.debug(`Querying Grafana: ${urlString}`); + + const response = await fetch(urlString, { headers: { Authorization: `Basic ${credentials}`, }, @@ -65,7 +87,7 @@ export class AlertChecker { for (const alert of alerts) { this.logger.info(`Checking alert: ${JSON.stringify(alert)}`); - const metricValue = await this.queryGrafana(alert.expr); + const metricValue = await this.queryGrafana(alert); this.logger.info(`Metric value: ${metricValue}`); if (metricValue > 0) { this.logger.error(`Alert ${alert.alert} triggered! Value: ${metricValue}`); diff --git a/yarn-project/end-to-end/src/spartan/gating-passive.test.ts b/yarn-project/end-to-end/src/spartan/gating-passive.test.ts index 9f9438d1f1d..99d5b06dc2c 100644 --- a/yarn-project/end-to-end/src/spartan/gating-passive.test.ts +++ b/yarn-project/end-to-end/src/spartan/gating-passive.test.ts @@ -26,6 +26,14 @@ const qosAlerts: AlertConfig[] = [ for: '10m', annotations: {}, }, + { + // Checks that we are not syncing from scratch each time we reboot + alert: 'ArchiverL1BlocksSynced', + expr: 'rate(aztec_archiver_l1_blocks_synced[1m]) > 0.5', + labels: { severity: 'error' }, + for: '10m', + annotations: {}, + }, ]; const config = setupEnvironment(process.env); @@ -52,6 +60,12 @@ describe('a test that passively observes the network in the presence of network const MAX_MISSED_SLOT_PERCENT = 0.6; afterAll(async () => { + await startPortForward({ + resource: `svc/metrics-grafana`, + namespace: 'metrics', + containerPort: config.CONTAINER_METRICS_PORT, + hostPort: config.HOST_METRICS_PORT, + }); await runAlertCheck(config, qosAlerts, debugLogger); }); @@ -69,12 +83,6 @@ describe('a test that passively observes the network in the presence of network hostPort: HOST_ETHEREUM_PORT, }); - await startPortForward({ - resource: `svc/metrics-grafana`, - namespace: 'metrics', - containerPort: config.CONTAINER_METRICS_PORT, - hostPort: config.HOST_METRICS_PORT, - }); const client = await createCompatibleClient(PXE_URL, debugLogger); const ethCheatCodes = new EthCheatCodes(ETHEREUM_HOST); const rollupCheatCodes = new RollupCheatCodes( @@ -93,7 +101,7 @@ describe('a test that passively observes the network in the presence of network // note, don't forget that normally an epoch doesn't need epochDuration worth of blocks, // but here we do double duty: // we want a handful of blocks, and we want to pass the epoch boundary - await awaitL2BlockNumber(rollupCheatCodes, epochDuration, 60 * 5, debugLogger); + await awaitL2BlockNumber(rollupCheatCodes, epochDuration, 60 * 6, debugLogger); let deploymentOutput: string = ''; deploymentOutput = await applyNetworkShaping({ diff --git a/yarn-project/end-to-end/src/spartan/utils.ts b/yarn-project/end-to-end/src/spartan/utils.ts index 82a4c56d610..9874929f821 100644 --- a/yarn-project/end-to-end/src/spartan/utils.ts +++ b/yarn-project/end-to-end/src/spartan/utils.ts @@ -23,7 +23,7 @@ const k8sLocalConfigSchema = z.object({ HOST_METRICS_PORT: z.coerce.number().min(1, 'HOST_METRICS_PORT env variable must be set'), CONTAINER_METRICS_PORT: z.coerce.number().default(80), GRAFANA_PASSWORD: z.string().min(1, 'GRAFANA_PASSWORD env variable must be set'), - METRICS_API_PATH: z.string().default('/api/datasources/proxy/uid/spartan-metrics-prometheus/api/v1/query'), + METRICS_API_PATH: z.string().default('/api/datasources/proxy/uid/spartan-metrics-prometheus/api/v1'), SPARTAN_DIR: z.string().min(1, 'SPARTAN_DIR env variable must be set'), K8S: z.literal('local'), }); diff --git a/yarn-project/telemetry-client/src/metrics.ts b/yarn-project/telemetry-client/src/metrics.ts index 22b83427ec5..84fe768da50 100644 --- a/yarn-project/telemetry-client/src/metrics.ts +++ b/yarn-project/telemetry-client/src/metrics.ts @@ -36,6 +36,7 @@ export const MEMPOOL_PROVER_QUOTE_COUNT = 'aztec.mempool.prover_quote_count'; export const MEMPOOL_PROVER_QUOTE_SIZE = 'aztec.mempool.prover_quote_size'; export const ARCHIVER_SYNC_DURATION = 'aztec.archiver.sync_duration'; +export const ARCHIVER_L1_BLOCKS_SYNCED = 'aztec.archiver.l1_blocks_synced'; export const ARCHIVER_BLOCK_HEIGHT = 'aztec.archiver.block_height'; export const ARCHIVER_BLOCK_SIZE = 'aztec.archiver.block_size'; export const ARCHIVER_ROLLUP_PROOF_DELAY = 'aztec.archiver.rollup_proof_delay'; diff --git a/yarn-project/telemetry-client/src/noop.ts b/yarn-project/telemetry-client/src/noop.ts index 11872833ba6..120c8a6e3c5 100644 --- a/yarn-project/telemetry-client/src/noop.ts +++ b/yarn-project/telemetry-client/src/noop.ts @@ -15,6 +15,10 @@ export class NoopTelemetryClient implements TelemetryClient { return Promise.resolve(); } + flush(): Promise { + return Promise.resolve(); + } + isEnabled() { return false; } diff --git a/yarn-project/telemetry-client/src/otel.ts b/yarn-project/telemetry-client/src/otel.ts index e5b8be2f224..5ba3f5bcec9 100644 --- a/yarn-project/telemetry-client/src/otel.ts +++ b/yarn-project/telemetry-client/src/otel.ts @@ -95,6 +95,14 @@ export class OpenTelemetryClient implements TelemetryClient { return true; } + public async flush() { + await Promise.all([ + this.meterProvider.forceFlush(), + this.loggerProvider.forceFlush(), + this.traceProvider instanceof NodeTracerProvider ? this.traceProvider.forceFlush() : Promise.resolve(), + ]); + } + public async stop() { const flushAndShutdown = async (provider: { forceFlush: () => Promise; shutdown: () => Promise }) => { await provider.forceFlush(); diff --git a/yarn-project/telemetry-client/src/telemetry.ts b/yarn-project/telemetry-client/src/telemetry.ts index 60e55b8b1c6..9fb8497fb21 100644 --- a/yarn-project/telemetry-client/src/telemetry.ts +++ b/yarn-project/telemetry-client/src/telemetry.ts @@ -18,7 +18,7 @@ import { import * as Attributes from './attributes.js'; import * as Metrics from './metrics.js'; -export { ValueType, Span } from '@opentelemetry/api'; +export { Span, ValueType } from '@opentelemetry/api'; type ValuesOf = T extends Record ? U : never; @@ -115,6 +115,11 @@ export interface TelemetryClient { * Stops the telemetry client. */ stop(): Promise; + + /** + * Flushes the telemetry client. + */ + flush(): Promise; } /** Objects that adhere to this interface can use @trackSpan */