Skip to content

Commit

Permalink
Merge pull request #5 from oracle-quickstart/feature/monitoring
Browse files Browse the repository at this point in the history
add monitoring stack
  • Loading branch information
streamnsight authored Mar 29, 2023
2 parents cf0e1f9 + ad8675a commit dbcaaf9
Show file tree
Hide file tree
Showing 20 changed files with 1,529 additions and 22 deletions.
12 changes: 12 additions & 0 deletions add_on_dependency_matrix.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
## Copyright © 2023, Oracle and/or its affiliates.
## All rights reserved. The Universal Permissive License (UPL), Version 1.0 as shown at http://oss.oracle.com/licenses/upl

# Defines the triggers that enable specific components based on the user's selection
locals {
  # Flink follows the user's selection directly.
  enable_flink = var.enable_flink

  # cert-manager is installed when requested, and is forced on whenever Flink is enabled.
  enable_cert_manager = var.enable_cert_manager || local.enable_flink

  # The cluster autoscaler is enabled as soon as any node pool asks for autoscaling.
  enable_cluster_autoscaler = (
    var.np1_enable_autoscaler ||
    var.np2_enable_autoscaler ||
    var.np3_enable_autoscaler
  )

  # The monitoring stack follows the user's selection directly.
  enable_monitoring_stack = var.enable_monitoring_stack

  # metrics-server is installed when requested, and is forced on by the
  # cluster autoscaler or the monitoring stack.
  enable_metrics_server = (
    var.enable_metrics_server ||
    local.enable_cluster_autoscaler ||
    local.enable_monitoring_stack
  )

  # Flink dashboards for Grafana only make sense when both Flink and monitoring are enabled.
  enable_grafana_flink_dashboards = local.enable_flink && local.enable_monitoring_stack
}
6 changes: 3 additions & 3 deletions helm_cert_manager.tf → add_on_helm_cert_manager.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
## https://github.com/jetstack/cert-manager/blob/master/README.md
## https://artifacthub.io/packages/helm/cert-manager/cert-manager

locals {
enable_cert_manager = var.enable_flink ? true : var.enable_cert_manager
}
# locals {
# enable_cert_manager = var.enable_flink ? true : var.enable_cert_manager
# }

resource "helm_release" "cert_manager" {
count = local.enable_cert_manager ? 1 : 0
Expand Down
File renamed without changes.
File renamed without changes.
103 changes: 103 additions & 0 deletions add_on_monitoring_stack.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# Copyright (c) 2021, 2023, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl.

# Inputs for the monitoring stack, assembled from the files under templates/.
locals {
  # Helm release name; also used as the prefix of the ConfigMap names below.
  deployment_name = "kps"

  # Values substituted into the Grafana datasource templates.
  vars = {
    "region"       = var.region,
    "tenancy_ocid" = var.tenancy_ocid
  }

  # Raw contents of every templates/*.scrapeConfigs.yaml file.
  scrape_configs = flatten([
    for scrape_file in fileset("${path.module}/templates", "*.scrapeConfigs.yaml") :
    file("${path.module}/templates/${scrape_file}")
  ])

  # Every templates/grafana.*.datasource.yaml file, rendered with local.vars and
  # decoded from YAML; flatten() merges files that decode to lists into one list.
  grafana_datasources = flatten([
    for datasource_file in fileset("${path.module}/templates", "grafana.*.datasource.yaml") :
    yamldecode(templatefile("${path.module}/templates/${datasource_file}", local.vars))
  ])

  # One entry per templates/grafana.*.dashboard.json file. "label" is the second
  # dot-separated segment of the file name.
  grafana_dashboards = flatten([
    for dashboard_file in fileset("${path.module}/templates", "grafana.*.dashboard.json") :
    {
      "name"  = dashboard_file,
      "label" = split(".", dashboard_file)[1]
    }
  ])

  # Raw YAML text of the Grafana plugin list.
  grafana_plugins = file("${path.module}/templates/grafana.plugins.yaml")
}

# Exposes the list of discovered Grafana dashboard files (name + label).
# The block label is quoted to follow the standard Terraform style; the
# output name itself is unchanged.
output "dash" {
  description = "Grafana dashboard files discovered under templates/ (file name and derived label)."
  value       = local.grafana_dashboards
}

# Random admin password for Grafana; only generated when the monitoring stack is enabled.
resource "random_password" "grafana_password" {
  count = local.enable_monitoring_stack ? 1 : 0

  length  = 20
  special = true

  # Restricts the special characters that may appear in the password.
  # NOTE(review): "]" is listed twice in this set; changing the string would
  # force the password to be regenerated, so it is left as is.
  override_special = "#$%&@!_+=./;:][{}]"
}

# Generated Grafana password, or an empty string when the monitoring stack is disabled.
output "grafana_password" {
  sensitive = true
  value     = !local.enable_monitoring_stack ? "" : random_password.grafana_password[0].result
}

# Installs the kube-prometheus-stack Helm chart into the "monitoring" namespace
# when the monitoring stack is enabled.
resource "helm_release" "kube_prometheus_stack" {
  count            = local.enable_monitoring_stack ? 1 : 0
  name             = local.deployment_name
  repository       = "https://prometheus-community.github.io/helm-charts"
  chart            = "kube-prometheus-stack"
  namespace        = "monitoring"
  version          = "45.8.0"
  wait             = false
  create_namespace = true

  # All scrape config files are concatenated and passed as a single string value.
  set {
    name  = "prometheus.prometheusSpec.additionalScrapeConfigs"
    value = join("\n", local.scrape_configs)
  }

  # set {
  #   name  = "grafana.sidecar.datasources.defaultDatasourceEnabled"
  #   value = false
  # }

  # The admin password is passed through set_sensitive so that it is not
  # displayed in plan output.
  # NOTE(review): the value is still parsed with Helm's --set syntax; the
  # password alphabet includes characters such as "{" and "[" that presumably
  # have special meaning there - confirm, or move the password into `values`.
  set_sensitive {
    name  = "grafana.adminPassword"
    value = random_password.grafana_password[0].result
  }

  # Grafana datasources and plugins are supplied as structured values.
  values = [
    yamlencode({ "grafana" = {
      "additionalDataSources" = local.grafana_datasources,
      "plugins"               = yamldecode(local.grafana_plugins)
    } })
  ]

  # Wait for the node pool resources before installing the chart.
  depends_on = [
    oci_containerengine_node_pool.oci_oke_node_pool
  ]
}

# One ConfigMap per Grafana dashboard JSON file found under templates/.
# Each ConfigMap holds a single key (the file name) whose value is the file content.
resource "kubernetes_config_map_v1" "grafana_dashboards" {
  count = local.enable_monitoring_stack ? length(local.grafana_dashboards) : 0

  metadata {
    name      = "${local.deployment_name}-grafana-${local.grafana_dashboards[count.index].label}"
    namespace = "monitoring"
    # NOTE(review): presumably the label the Grafana sidecar watches to load dashboards - confirm.
    labels = {
      "grafana_dashboard" = "1"
    }
  }

  data = {
    # A parenthesised key is evaluated as an expression; this replaces the
    # deprecated interpolation-only form "${...}" for both key and value.
    (local.grafana_dashboards[count.index].name) = file("${path.module}/templates/${local.grafana_dashboards[count.index].name}")
  }

  # The namespace is created by the Helm release, so it must exist first.
  depends_on = [
    helm_release.kube_prometheus_stack
  ]
}

# ConfigMap carrying the raw Grafana plugin list, created only when the
# monitoring stack is enabled.
resource "kubernetes_config_map_v1" "grafana_plugins" {
  count = local.enable_monitoring_stack ? 1 : 0

  # Created after the Helm release, which creates the "monitoring" namespace.
  depends_on = [helm_release.kube_prometheus_stack]

  metadata {
    name      = "${local.deployment_name}-grafana-plugins"
    namespace = "monitoring"
    labels    = { "grafana_plugin" = "1" }
  }

  # Single key holding the unparsed YAML text of templates/grafana.plugins.yaml.
  data = { "plugins" = local.grafana_plugins }
}
6 changes: 3 additions & 3 deletions helm_metrics.tf → add_onn_helm_metrics.tf
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
## Copyright © 2022-2023, Oracle and/or its affiliates.
## All rights reserved. The Universal Permissive License (UPL), Version 1.0 as shown at http://oss.oracle.com/licenses/upl

locals {
enable_metrics_server = var.np1_enable_autoscaler || var.np2_enable_autoscaler || var.np3_enable_autoscaler ? true : var.enable_metrics_server
}
# locals {
# enable_metrics_server = var.np1_enable_autoscaler || var.np2_enable_autoscaler || var.np3_enable_autoscaler ? true : var.enable_metrics_server
# }

resource "helm_release" "metrics_server" {
count = local.enable_metrics_server ? 1 : 0
Expand Down
7 changes: 4 additions & 3 deletions datasources.tf
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## Copyright © 2022, Oracle and/or its affiliates.
## Copyright © 2022-2023, Oracle and/or its affiliates.
## All rights reserved. The Universal Permissive License (UPL), Version 1.0 as shown at http://oss.oracle.com/licenses/upl

data "oci_containerengine_cluster_option" "cluster_options" {
Expand All @@ -12,8 +12,7 @@ data "oci_containerengine_node_pool_option" "oci_oke_node_pool_option" {
# Gets home and current regions
data "oci_identity_tenancy" "tenant_details" {
tenancy_id = var.tenancy_ocid

provider = oci.current_region
provider = oci.current_region
}

data "oci_identity_regions" "home_region" {
Expand Down Expand Up @@ -49,6 +48,7 @@ data "oci_limits_limit_definitions" "limit_def" {
service_name = "compute"
}

# Build maps of valid shapes for each AD
locals {
availability_map = [for def in data.oci_limits_limit_definitions.limit_def.limit_definitions : def if contains(compact([var.np1_node_shape, var.np2_node_shape, var.np3_node_shape]), def.description)]
limits_definitions = [
Expand All @@ -75,6 +75,7 @@ data "oci_core_shapes" "valid_shapes" {
availability_domain = data.oci_identity_availability_domains.ADs.availability_domains[count.index].name
}

# Deploy ID to uniquely identify this cluster and associated resources.
resource "random_string" "deploy_id" {
length = 4
special = false
Expand Down
10 changes: 1 addition & 9 deletions images.tf
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ data "oci_core_image" "np3_image" {
image_id = var.np3_image_id
}

# Identify if an OKE specific image is available for the Compute image selected
locals {
k8s_version = replace(local.kubernetes_version, "v", "")
np1_oke_image = var.node_pool_count >= 1 ? [for option
Expand All @@ -30,12 +31,3 @@ locals {
option if length(regexall("${data.oci_core_image.np3_image[0].display_name}-OKE-${local.k8s_version}", option.source_name)) > 0] : []
np3_oke_image_id = length(local.np3_oke_image) > 0 ? local.np3_oke_image[0].image_id : var.np3_image_id
}

# output "images" {
# value = {
# k8s_version = local.k8s_version
# np1_oke_image = local.np1_oke_image
# np2_oke_image = local.np2_oke_image
# np3_oke_image = local.np3_oke_image
# }
# }
2 changes: 1 addition & 1 deletion outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ output "access_command" {
}

output "flink_demo_job" {
value = "kubectl create -f https://raw.githubusercontent.com/apache/flink-kubernetes-operator/release-1.2/examples/basic.yaml"
value = "kubectl create -f https://raw.githubusercontent.com/apache/flink-kubernetes-operator/release-1.3/examples/basic.yaml"
}

output "flink_ui_port_forward" {
Expand Down
2 changes: 1 addition & 1 deletion policies.tf
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ locals {
}

resource "oci_identity_network_source" "node_pool_network_source" {
provider = oci.home_region
provider = oci.home_region
#Required
compartment_id = var.tenancy_ocid
description = "NSG for ${local.nsg_name} autoscaler"
Expand Down
19 changes: 17 additions & 2 deletions schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ variableGroups:
- enable_flink
- enable_cert_manager
- enable_metrics_server
- enable_monitoring_stack

variables:

Expand Down Expand Up @@ -1069,13 +1070,21 @@ variables:
description: |
Apache Flink will be installed using the Flink Operator.
# Boolean toggle (enabled by default) for the monitoring stack; read in Terraform as var.enable_monitoring_stack.
enable_monitoring_stack:
type: boolean
default: true
title: Deploy Monitoring Stack
description: |
Deploys Prometheus, Grafana and related datasources, plugins and dashboards.
outputGroups:
- title: Access Command
- title: Access
outputs:
- access_command
- flink_demo_job
- flink_ui_port_forward
- flink_ui_access
- grafana_password

outputs:
access_command:
Expand All @@ -1100,4 +1109,10 @@ outputs:
type: link
title: Flink UI
displayText: "Access the Flink UI"
visible: true
visible: true

# Generated Grafana admin password, shown as a copyable string.
grafana_password:
type: copyableString
title: Grafana access
displayText: "Grafana password for 'admin' user"
# Was left empty (null); set explicitly, matching the Flink UI output above.
visible: true
44 changes: 44 additions & 0 deletions templates/grafana.default.dashboards.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Catalog of Grafana dashboards keyed by dashboard name. Each entry gives a
# gnetId, a revision and the datasource the dashboard is bound to.
# NOTE(review): gnetId presumably is the grafana.com dashboard ID, in the format
# used by the Grafana Helm chart's `dashboards` value - confirm.
# NOTE(review): none of the fileset() patterns visible in add_on_monitoring_stack.tf
# match this file name - confirm where this file is consumed.
k8s-cluster:
gnetId: 7249
revision: 1
datasource: Prometheus
k8s-cluster-metrics:
gnetId: 11663
revision: 1
datasource: Prometheus
k8s-cluster-metrics-simple:
gnetId: 6417
revision: 1
datasource: Prometheus
k8s-pods-monitoring:
gnetId: 13498
revision: 1
datasource: Prometheus
k8s-memory:
gnetId: 13421
revision: 1
datasource: Prometheus
k8s-networking:
gnetId: 12658
revision: 1
datasource: Prometheus
k8s-cluster-autoscaler:
gnetId: 3831
revision: 1
datasource: Prometheus
k8s-hpa:
gnetId: 10257
revision: 1
datasource: Prometheus
k8s-pods:
gnetId: 6336
revision: 1
datasource: Prometheus
oci-compute:
gnetId: 13596
revision: 1
datasource: Oracle Cloud Infrastructure Metrics
oci-oke:
gnetId: 13594
revision: 1
datasource: Oracle Cloud Infrastructure Metrics
Loading

0 comments on commit dbcaaf9

Please sign in to comment.