diff --git a/README.md b/README.md index ec8cc55..5a9e85d 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ ## About this project -A set of Plutono dashboards and Prometheus alerting rules combined with playbooks to ensure effective operations of Kubernetes. +A set of Plutono and Perses dashboards and Prometheus alerting rules combined with playbooks to ensure effective operations of Kubernetes. # Content @@ -25,6 +25,8 @@ kubernetes-operations │ ├── dashboards Plutono dashboards for visualizing key metrics. │ + ├── perses-dashboards Perses dashboards for visualizing key metrics. + │ └── Chart.yaml Helm chart manifest. ``` @@ -37,6 +39,7 @@ The content of the repository can be installed independently or as part of the [ | Key | Type | Default | Description | |-----|------|---------|-------------| | dashboards.create | bool | `true` | Enables ConfigMap resources with dashboards to be created | +| dashboards.persesSelectors | list | `[{"name":"perses.dev/resource","value":"\"true\""}]` | Label selectors for the Perses dashboards to be picked up by Perses. | | dashboards.plutonoSelectors | list | `[{"name":"plutono-dashboard","value":"\"true\""}]` | Label selectors for the Plutono dashboards to be picked up by Plutono. | | global.commonLabels | object | `{}` | Common labels to add to all resources # | | prometheusRules.NodeInMaintenance | object | `{"label":"maintenance_state","value":"in-maintenance"}` | The label value pair that marks a Kubernetes node as 'in maintenance' | diff --git a/README.md.gotmpl b/README.md.gotmpl index cde0aff..fb80169 100644 --- a/README.md.gotmpl +++ b/README.md.gotmpl @@ -4,7 +4,7 @@ ## About this project -A set of Plutono dashboards and Prometheus alerting rules combined with playbooks to ensure effective operations of Kubernetes. +A set of Plutono and Perses dashboards and Prometheus alerting rules combined with playbooks to ensure effective operations of Kubernetes. 
# Content @@ -25,6 +25,8 @@ kubernetes-operations │ ├── dashboards Plutono dashboards for visualizing key metrics. │ + ├── perses-dashboards Perses dashboards for visualizing key metrics. + │ └── Chart.yaml Helm chart manifest. ``` diff --git a/charts/kubernetes-operations/Chart.yaml b/charts/kubernetes-operations/Chart.yaml index c2b9575..e72bc37 100644 --- a/charts/kubernetes-operations/Chart.yaml +++ b/charts/kubernetes-operations/Chart.yaml @@ -3,7 +3,7 @@ apiVersion: v2 name: kubernetes-operations -version: 1.2.4 +version: 1.2.5 description: A set of Plutono dashboards and Prometheus alerting rules combined with playbooks to ensure effective operations of Kubernetes. maintainers: - name: richardtief @@ -12,6 +12,7 @@ keywords: - Helm Chart - Kubernetes operations - Plutono Dashboards + - Perses Dashboards - Prometheus Alerting - Alert Rules - Playbooks diff --git a/charts/kubernetes-operations/perses-dashboards/api-server.json b/charts/kubernetes-operations/perses-dashboards/api-server.json new file mode 100644 index 0000000..9b83c38 --- /dev/null +++ b/charts/kubernetes-operations/perses-dashboards/api-server.json @@ -0,0 +1,589 @@ +{ + "kind": "Dashboard", + "metadata": { + "name": "apiserver", + "project": "default" + }, + "spec": { + "display": { + "name": "APIServer" + }, + "panels": { + "0": { + "kind": "Panel", + "spec": { + "display": { + "name": "API Server - Health Status" + }, + "plugin": { + "kind": "StatChart", + "spec": { + "calculation": "last-number", + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 1 + } + ] + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "up{job=~\".*apiserver.*\"}", + "seriesNameFormat": "" + } + } + } + } + ] + } + }, + "1": { + "kind": "Panel", + "spec": { + "display": { + "name": "HTTP Requests by code" + }, + "plugin": { + "kind": "TimeSeriesChart", 
+ "spec": { + "legend": { + "position": "bottom" + }, + "visual": { + "areaOpacity": 0.3, + "stack": "all" + }, + "yAxis": { + "show": true, + "label": "", + "format": { + "unit": "requests/sec" + } + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "sum by (code) (rate(apiserver_request_total[5m]))", + "seriesNameFormat": " {{ code }}" + } + } + } + } + ] + } + }, + "2": { + "kind": "Panel", + "spec": { + "display": { + "name": "HTTP Requests by verb" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "position": "bottom" + }, + "visual": { + "areaOpacity": 0.3, + "connectNulls": false, + "stack": "all" + }, + "yAxis": { + "show": true, + "label": "", + "format": { + "unit": "requests/sec" + } + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "sum by (verb) (rate(apiserver_request_total[5m]))", + "seriesNameFormat": " {{ verb}}" + } + } + } + } + ] + } + }, + "3": { + "kind": "Panel", + "spec": { + "display": { + "name": "HTTP Requests Latency (99th percentile) by verb" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "position": "bottom" + }, + "visual": { + "areaOpacity": 0.3, + "stack": "all" + }, + "yAxis": { + "format": { + "unit": "milliseconds" + } + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!=\"WATCH\"}[5m])) by (verb, le))", + "seriesNameFormat": "{{ verb }}" + } + } + } + } + ] + } + }, + "4": { + "kind": "Panel", + "spec": { + "display": { + "name": "Top 10 Objects by kind" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "position": "bottom" + }, + 
"visual": { + "areaOpacity": 0.3, + "stack": "all" + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "topk(10, max by (resource) (apiserver_storage_objects{job=\"apiserver\"}))", + "seriesNameFormat": "{{ resource }}" + } + } + } + } + ] + } + }, + "5": { + "kind": "Panel", + "spec": { + "display": { + "name": "Errors" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "visual": { + "areaOpacity": 0.25, + "connectNulls": true, + "display": "line", + "lineWidth": 2, + "stack": "all" + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "sum(rate(apiserver_request_total{job=\"apiserver\", code=~\"5..\"}[5m]))\n/ \nsum(rate(apiserver_request_total{job=\"apiserver\"}[5m]))", + "seriesNameFormat": "" + } + } + } + } + ] + } + }, + "6": { + "kind": "Panel", + "spec": { + "display": { + "name": "Errors by verb" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "mode": "list", + "position": "bottom", + "values": [] + }, + "visual": { + "areaOpacity": 0.25, + "connectNulls": true, + "display": "line", + "lineWidth": 2 + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "sum by(verb) (rate(apiserver_request_total{job=\"apiserver\", code=~\"5..\"}[5m]))\n /\nsum by(verb) (rate(apiserver_request_total{job=\"apiserver\"}[5m]))", + "seriesNameFormat": "{{ verb }}" + } + } + } + } + ] + } + }, + "7": { + "kind": "Panel", + "spec": { + "display": { + "name": "Stacked HTTP Requests" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "mode": "list", + "position": "bottom", + "values": [] + }, + "visual": { + "areaOpacity": 0.25, + "connectNulls": true, + "display": "line", + 
"lineWidth": 2 + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "sum(rate(apiserver_request_total{job=\"apiserver\"}[5m])) by (job, instance)", + "seriesNameFormat": "{{ job }}:{{ instance }}" + } + } + } + } + ] + } + }, + "8": { + "kind": "Panel", + "spec": { + "display": { + "name": "Work Queue" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "mode": "list", + "position": "bottom", + "values": [] + }, + "visual": { + "areaOpacity": 0.25, + "connectNulls": true, + "display": "line", + "lineWidth": 2 + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "sum(rate(workqueue_depth{job=\"apiserver\"}[5m])) by (instance)", + "seriesNameFormat": "{{ instance }}" + } + } + } + } + ] + } + }, + "9": { + "kind": "Panel", + "spec": { + "display": { + "name": "CPU Usage" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "mode": "list", + "position": "bottom", + "values": [] + }, + "visual": { + "areaOpacity": 0.25, + "connectNulls": true, + "display": "line", + "lineWidth": 2 + }, + "yAxis": { + "format": { + "unit": "percent-decimal" + } + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "rate(process_cpu_seconds_total{job=\"apiserver\"}[5m])", + "seriesNameFormat": "{{ job }}:{{ instance }}" + } + } + } + } + ] + } + }, + "10": { + "kind": "Panel", + "spec": { + "display": { + "name": "Memory Usage" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "mode": "list", + "position": "bottom", + "values": [] + }, + "visual": { + "areaOpacity": 0.25, + "connectNulls": true, + "display": "line", + "lineWidth": 2 + }, + "yAxis": { + "format": { + "unit": "bytes" + 
} + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "process_resident_memory_bytes{job=\"apiserver\"}", + "seriesNameFormat": "{{ job }}:{{ instance }}" + } + } + } + } + ] + } + } + }, + "layouts": [ + { + "kind": "Grid", + "spec": { + "items": [ + { + "x": 0, + "y": 0, + "width": 24, + "height": 5, + "content": { + "$ref": "#/spec/panels/0" + } + }, + { + "x": 0, + "y": 5, + "width": 12, + "height": 8, + "content": { + "$ref": "#/spec/panels/1" + } + }, + { + "x": 12, + "y": 5, + "width": 12, + "height": 8, + "content": { + "$ref": "#/spec/panels/2" + } + }, + { + "x": 0, + "y": 13, + "width": 24, + "height": 8, + "content": { + "$ref": "#/spec/panels/3" + } + }, + { + "x": 0, + "y": 21, + "width": 24, + "height": 10, + "content": { + "$ref": "#/spec/panels/4" + } + }, + { + "x": 0, + "y": 31, + "width": 12, + "height": 8, + "content": { + "$ref": "#/spec/panels/5" + } + }, + { + "x": 12, + "y": 31, + "width": 12, + "height": 8, + "content": { + "$ref": "#/spec/panels/6" + } + }, + { + "x": 0, + "y": 39, + "width": 12, + "height": 8, + "content": { + "$ref": "#/spec/panels/7" + } + }, + { + "x": 12, + "y": 39, + "width": 12, + "height": 8, + "content": { + "$ref": "#/spec/panels/8" + } + }, + { + "x": 0, + "y": 47, + "width": 12, + "height": 8, + "content": { + "$ref": "#/spec/panels/9" + } + }, + { + "x": 12, + "y": 47, + "width": 12, + "height": 8, + "content": { + "$ref": "#/spec/panels/10" + } + } + ] + } + } + ], + "variables": [ + { + "kind": "ListVariable", + "spec": { + "display": { + "name": "Prometheus Instance", + "hidden": false + }, + "defaultValue": "kube-monitoring-obs-eu-de-1-prometheus", + "allowAllValue": false, + "allowMultiple": false, + "sort": "alphabetical-asc", + "plugin": { + "kind": "PrometheusLabelValuesVariable", + "spec": { + "labelName": "job", + "matchers": ["prometheus_build_info"] + } + }, + "name": 
"prometheus-instance" + } + } + ], + "duration": "12h", + "refreshInterval": "1m", + "datasources": {} + } +} diff --git a/charts/kubernetes-operations/perses-dashboards/core-dns.json b/charts/kubernetes-operations/perses-dashboards/core-dns.json new file mode 100644 index 0000000..7635282 --- /dev/null +++ b/charts/kubernetes-operations/perses-dashboards/core-dns.json @@ -0,0 +1,686 @@ +{ + "kind": "Dashboard", + "metadata": { + "name": "coredns", + "project": "default" + }, + "spec": { + "display": { + "name": "CoreDNS" + }, + "panels": { + "0": { + "kind": "Panel", + "spec": { + "display": { + "name": "Memory usage" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "position": "bottom" + }, + "visual": { + "areaOpacity": 0.3, + "stack": "all" + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "container_memory_working_set_bytes{pod=~\"coredns-.*\", namespace=\"kube-system\"}", + "seriesNameFormat": "{{pod}}" + } + } + } + } + ] + } + }, + "1": { + "kind": "Panel", + "spec": { + "display": { + "name": "Goroutines" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "position": "bottom" + }, + "visual": { + "areaOpacity": 0.3, + "stack": "all" + }, + "yAxis": { + "format": { + "unit": "decimal" + }, + "label": "", + "show": true + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "label_replace((go_goroutines{kubernetes_pod_name=~\"coredns-.*\", kubernetes_namespace=\"kube-system\"} or go_goroutines{pod=~\"coredns-.*\", namespace=\"kube-system\"}), \"pod\", \"$1\", \"kubernetes_pod_name\", \"(.+)\")", + "seriesNameFormat": "{{pod}}" + } + } + } + } + ] + } + }, + "2_0": { + "kind": "Panel", + "spec": { + "display": { + "name": "Requests" + }, + "plugin": { + "kind": "TimeSeriesChart", + 
"spec": { + "legend": { + "position": "bottom" + }, + "visual": { + "areaOpacity": 0.3, + "stack": "all" + }, + "yAxis": { + "show": true, + "label": "", + "format": { + "unit": "requests/sec" + } + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "sum (rate(coredns_dns_requests_total[5m])) or sum (rate(coredns_dns_request_count_total[5m]))", + "seriesNameFormat": "total" + } + } + } + }, + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "sum (rate(coredns_forward_requests_total[5m])) or sum (rate(coredns_forward_request_count_total[5m]))", + "seriesNameFormat": "forwarded" + } + } + } + } + ] + } + }, + "2_1": { + "kind": "Panel", + "spec": { + "display": { + "name": "Requests per instance" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "position": "bottom" + }, + "visual": { + "areaOpacity": 0.3, + "stack": "all" + }, + "yAxis": { + "show": true, + "label": "", + "format": { + "unit": "requests/sec" + } + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "sum by (pod) (label_replace(rate(coredns_dns_requests_total[5m]), \"pod\", \"$1\", \"kubernetes_pod_name\", \"(.+)\")) or sum by (pod) (label_replace(rate(coredns_dns_request_count_total[5m]), \"pod\", \"$1\", \"kubernetes_pod_name\", \"(.+)\"))", + "seriesNameFormat": "{{pod}}" + } + } + } + } + ] + } + }, + "2_2": { + "kind": "Panel", + "spec": { + "display": { + "name": "Request latency (99th percentile)" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "position": "bottom" + }, + "visual": { + "areaOpacity": 0.3, + "stack": "all" + }, + "yAxis": { + "show": true, + "label": "", + "format": { + "unit": "requests/sec" + } + } + } + }, + 
"queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))", + "seriesNameFormat": "total" + } + } + } + }, + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "histogram_quantile(0.99, sum(rate(coredns_forward_request_duration_seconds_bucket[5m])) by (le))", + "seriesNameFormat": "forwarded" + } + } + } + } + ] + } + }, + "2_3": { + "kind": "Panel", + "spec": { + "display": { + "name": "Requests by type" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "position": "bottom" + }, + "visual": { + "areaOpacity": 0.3, + "stack": "all" + }, + "yAxis": { + "show": true, + "label": "", + "format": { + "unit": "requests/sec" + } + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "sum by (type) (rate(coredns_dns_requests_total[5m])) or sum by (type) (rate(coredns_dns_request_type_count_total[5m]))", + "seriesNameFormat": "{{type}}" + } + } + } + } + ] + } + }, + "2_4": { + "kind": "Panel", + "spec": { + "display": { + "name": "Responses by rcode" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "position": "bottom" + }, + "visual": { + "areaOpacity": 0.3, + "stack": "all" + }, + "yAxis": { + "show": true, + "label": "", + "format": { + "unit": "ops/sec" + } + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "sum by (rcode) (rate(coredns_dns_response_rcode_count_total[5m])) or sum by (rcode) (rate(coredns_dns_responses_total[5m]))", + "seriesNameFormat": "{{rcode}}" + } + } + } + } + ] + } + }, + "2_5": { + "kind": "Panel", +
"spec": { + "display": { + "name": "Forwarded responses by rcode" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "position": "bottom" + }, + "visual": { + "areaOpacity": 0.3, + "stack": "all" + }, + "yAxis": { + "show": true, + "label": "", + "format": { + "unit": "ops/sec" + } + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "sum by (rcode) (rate(coredns_forward_responses_total[5m])) or sum by (rcode) (rate(coredns_forward_response_rcode_count_total[5m]))", + "seriesNameFormat": "{{rcode}}" + } + } + } + } + ] + } + }, + "3_0": { + "kind": "Panel", + "spec": { + "display": { + "name": "Success Cache Size" + }, + "plugin": { + "kind": "StatChart", + "spec": { + "calculation": "last-number", + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "#EAB839", + "value": 3000 + }, + { + "color": "semi-dark-red", + "value": 6000 + } + ] + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "max(coredns_cache_entries{type=\"success\"})", + "seriesNameFormat": "" + } + } + } + } + ] + } + }, + "3_1": { + "kind": "Panel", + "spec": { + "display": { + "name": "Denial Cache Size" + }, + "plugin": { + "kind": "StatChart", + "spec": { + "calculation": "last-number", + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "#EAB839", + "value": 3000 + }, + { + "color": "semi-dark-red", + "value": 6000 + } + ] + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "max(coredns_cache_entries{type=\"denial\"})", + "seriesNameFormat": "" + } + } + } + } + ] + } + }, + "3_2": { + "kind": "Panel", + "spec": { + "display": { + "name": "Cache hits/misses" + }, +
"plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "position": "bottom" + }, + "visual": { + "areaOpacity": 0.3, + "stack": "all" + }, + "yAxis": { + "show": true, + "label": "", + "format": { + "unit": "ops/sec" + } + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "sum by (type) (rate(coredns_cache_hits_total[5m]))", + "seriesNameFormat": "{{type}} hits" + } + } + } + }, + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "sum (rate(coredns_cache_misses_total[5m]))", + "seriesNameFormat": "{{type}} misses" + } + } + } + }, + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "sum(rate(coredns_cache_hits_total[5m]))/(sum(rate(coredns_cache_hits_total[5m]))+sum(rate(coredns_cache_misses_total[5m])))", + "seriesNameFormat": "hit ratio" + } + } + } + } + ] + } + } + }, + "layouts": [ + { + "kind": "Grid", + "spec": { + "items": [ + { + "x": 0, + "y": 0, + "width": 12, + "height": 9, + "content": { + "$ref": "#/spec/panels/0" + } + }, + { + "x": 12, + "y": 0, + "width": 12, + "height": 9, + "content": { + "$ref": "#/spec/panels/1" + } + } + ] + } + }, + { + "kind": "Grid", + "spec": { + "display": { + "title": "Throughput", + "collapse": { + "open": true + } + }, + "items": [ + { + "x": 0, + "y": 0, + "width": 12, + "height": 9, + "content": { + "$ref": "#/spec/panels/2_0" + } + }, + { + "x": 12, + "y": 0, + "width": 12, + "height": 9, + "content": { + "$ref": "#/spec/panels/2_1" + } + }, + { + "x": 0, + "y": 9, + "width": 12, + "height": 8, + "content": { + "$ref": "#/spec/panels/2_2" + } + }, + { + "x": 12, + "y": 9, + "width": 12, + "height": 8, + "content": { + "$ref": "#/spec/panels/2_3" + } + }, + { + "x": 0, + "y": 17, + "width": 12, + "height": 8, + "content": 
{ + "$ref": "#/spec/panels/2_4" + } + }, + { + "x": 12, + "y": 17, + "width": 12, + "height": 8, + "content": { + "$ref": "#/spec/panels/2_5" + } + } + ] + } + }, + { + "kind": "Grid", + "spec": { + "display": { + "title": "Cache", + "collapse": { + "open": true + } + }, + "items": [ + { + "x": 0, + "y": 0, + "width": 6, + "height": 7, + "content": { + "$ref": "#/spec/panels/3_0" + } + }, + { + "x": 6, + "y": 0, + "width": 6, + "height": 7, + "content": { + "$ref": "#/spec/panels/3_1" + } + }, + { + "x": 12, + "y": 0, + "width": 12, + "height": 8, + "content": { + "$ref": "#/spec/panels/3_2" + } + } + ] + } + } + ], + "variables": [ + { + "kind": "ListVariable", + "spec": { + "display": { + "name": "Prometheus Instance", + "hidden": false + }, + "defaultValue": "kube-monitoring-obs-eu-de-1-prometheus", + "allowAllValue": false, + "allowMultiple": false, + "sort": "alphabetical-asc", + "plugin": { + "kind": "PrometheusLabelValuesVariable", + "spec": { + "labelName": "job", + "matchers": ["prometheus_build_info"] + } + }, + "name": "prometheus-instance" + } + } + ], + "duration": "6h", + "refreshInterval": "1m" + } +} diff --git a/charts/kubernetes-operations/perses-dashboards/kubernetes-container-resources.json b/charts/kubernetes-operations/perses-dashboards/kubernetes-container-resources.json new file mode 100644 index 0000000..67030f3 --- /dev/null +++ b/charts/kubernetes-operations/perses-dashboards/kubernetes-container-resources.json @@ -0,0 +1,889 @@ +{ + "kind": "Dashboard", + "metadata": { + "name": "kubernetes-container-resources", + "project": "default" + }, + "spec": { + "display": { + "name": "Kubernetes Container Resources" + }, + "panels": { + "0_0": { + "kind": "Panel", + "spec": { + "display": { + "name": "CPU usage", + "description": "# used formulas\n* [container_cpu_usage_seconds_total](https://github.com/google/cadvisor/blob/master/docs/storage/prometheus.md) to calculate the median CPU usage in % based on the cumulative CPU time consumed. 
If multiple threads are used their times are added to the sum" + }, + "plugin": { + "kind": "GaugeChart", + "spec": { + "calculation": "mean", + "format": { + "unit": "percent-decimal" + }, + "max": 1.5, + "thresholds": { + "steps": [ + { + "color": "#299c46", + "value": 0 + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 0.8 + }, + { + "color": "#d44a3a", + "value": 1 + } + ] + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "quantile(0.5, (rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",container=~\"$container\", pod=~\"$pod\"}[5m])))", + "seriesNameFormat": "{{instance}}" + } + } + } + } + ] + } + }, + "0_1": { + "kind": "Panel", + "spec": { + "display": { + "name": "CPU throttling", + "description": "# used formulas\n* ratio between [container_cpu_cfs_throttled_periods_total](https://github.com/google/cadvisor/blob/master/docs/storage/prometheus.md) and [container_cpu_cfs_periods_total](https://github.com/google/cadvisor/blob/master/docs/storage/prometheus.md) to calculate the median CPU throttling in %" + }, + "plugin": { + "kind": "GaugeChart", + "spec": { + "calculation": "last-number", + "format": { + "unit": "percent-decimal" + }, + "max": 1, + "thresholds": { + "steps": [ + { + "color": "#299c46", + "value": 0 + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 0.25 + }, + { + "color": "#d44a3a", + "value": 0.5 + } + ] + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "quantile(0.50, rate(container_cpu_cfs_throttled_periods_total{namespace=~\"$namespace\",pod=~\"$pod\", container=~\"$container\"}[5m]) / rate(container_cpu_cfs_periods_total{namespace=~\"$namespace\",pod=~\"$pod\", container=~\"$container\"}[5m]))", + "seriesNameFormat": "{{instance}}" + } + } + } + } + ] + } + }, + "0_2": { + 
"kind": "Panel", + "spec": { + "display": { + "name": "CPU limits", + "description": "# used formulas\n* [kube_pod_container_resource_limits](https://github.com/kubernetes/kube-state-metrics/blob/main/docs/pod-metrics.md) is used to calculate the total configured CPU limits of the selected pods or containers" + }, + "plugin": { + "kind": "GaugeChart", + "spec": { + "calculation": "mean", + "format": { + "unit": "decimal" + }, + "max": 137438953472, + "thresholds": { + "steps": [ + { + "color": "#badff4", + "value": 0 + }, + { + "color": "#82b5d8", + "value": 1000000000 + }, + { + "color": "#65c5db", + "value": 10000000000 + } + ] + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "sum(kube_pod_container_resource_limits{namespace=~\"$namespace\",resource=\"cpu\",container=~\"$container\", pod=~\"$pod\"})", + "seriesNameFormat": "Cores" + } + } + } + } + ] + } + }, + "0_3": { + "kind": "Panel", + "spec": { + "display": { + "name": "# Containers", + "description": "# used formulas\n* [kube_pod_container_status_ready](https://github.com/kubernetes/kube-state-metrics/blob/main/docs/pod-metrics.md) is used to calculate the number of ready containers of the selected pods or containers" + }, + "plugin": { + "kind": "GaugeChart", + "spec": { + "calculation": "mean", + "format": { + "unit": "decimal" + }, + "max": 137438953472, + "thresholds": { + "steps": [ + { + "color": "#badff4", + "value": 0 + }, + { + "color": "#82b5d8", + "value": 1000000000 + }, + { + "color": "#65c5db", + "value": 10000000000 + } + ] + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "sum(min_over_time(kube_pod_container_status_ready{namespace=~\"$namespace\",container=~\"$container\", pod=~\"$pod\"}[5m]))", + "seriesNameFormat": "{{instance}}" + } + } + } + } + ] + } + }, 
+ "0_4": { + "kind": "Panel", + "spec": { + "display": { + "name": "RAM requests usage", + "description": "# used formulas\n* median ratio between [container_memory_working_set_bytes](https://github.com/google/cadvisor/blob/master/docs/storage/prometheus.md) and [kube_pod_container_resource_requests](https://github.com/kubernetes/kube-state-metrics/blob/main/docs/pod-metrics.md) in % (provided by the [container_memory_utilization_ratio](https://github.com/sapcc/helm-charts/blob/f1c6d7fe8c9093b16e73d292e2454816a192ec22/prometheus-rules/metrics-regional-rules/templates/aggregations/collector/_resource.rules.tpl#L32) formula)" + }, + "plugin": { + "kind": "GaugeChart", + "spec": { + "calculation": "mean", + "format": { + "unit": "percent-decimal" + }, + "max": 1.5, + "thresholds": { + "steps": [ + { + "color": "#299c46", + "value": 0 + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 0.5 + }, + { + "color": "#d44a3a", + "value": 1 + } + ] + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "quantile(0.50, (\n sum(container_memory_working_set_bytes) by (namespace, pod, container)\n /\n sum(kube_pod_container_resource_requests{resource=\"memory\"}) by (namespace, pod, container)\n )\n)", + "seriesNameFormat": "{{instance}}" + } + } + } + } + ] + } + }, + "0_5": { + "kind": "Panel", + "spec": { + "display": { + "name": "RAM limits usage", + "description": "# used formulas\n* median ratio between [container_memory_working_set_bytes](https://github.com/google/cadvisor/blob/master/docs/storage/prometheus.md) and [kube_pod_container_resource_limits](https://github.com/kubernetes/kube-state-metrics/blob/main/docs/pod-metrics.md) in % (provided by the 
[container_memory_saturation_ratio](https://github.com/sapcc/helm-charts/blob/f1c6d7fe8c9093b16e73d292e2454816a192ec22/prometheus-rules/metrics-regional-rules/templates/aggregations/collector/_resource.rules.tpl#L26) formula)\n* 100% saturation means OOMKill!" + }, + "plugin": { + "kind": "GaugeChart", + "spec": { + "calculation": "mean", + "format": { + "unit": "percent-decimal" + }, + "max": 1, + "thresholds": { + "steps": [ + { + "color": "#299c46", + "value": 0 + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 0.5 + }, + { + "color": "#d44a3a", + "value": 0.8 + } + ] + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "quantile(0.50, (\n sum(container_memory_working_set_bytes) by (namespace, pod, container)\n /\n sum(kube_pod_container_resource_limits{resource=\"memory\"}) by (namespace, pod, container)\n )\n)", + "seriesNameFormat": "{{instance}}" + } + } + } + } + ] + } + }, + "0_6": { + "kind": "Panel", + "spec": { + "display": { + "name": "RAM usage", + "description": "# used formulas\n* [container_memory_working_set_bytes](https://github.com/google/cadvisor/blob/master/docs/storage/prometheus.md) is used to calculate the total RAM usage of the selected pods or containers" + }, + "plugin": { + "kind": "GaugeChart", + "spec": { + "calculation": "last-number", + "format": { + "unit": "bytes" + }, + "max": 137438953472, + "thresholds": { + "steps": [ + { + "color": "#cffaff", + "value": 0 + }, + { + "color": "#70dbed", + "value": 1000000000 + }, + { + "color": "#6ed0e0", + "value": 10000000000 + } + ] + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "sum(container_memory_working_set_bytes{namespace=~\"$namespace\",container=~\"$container\",pod=~\"$pod\"})", + "seriesNameFormat": "{{instance}}" + } + } + } + } + ] + } + }, + 
"0_7": { + "kind": "Panel", + "spec": { + "display": { + "name": "RAM requests", + "description": "# used formulas\n* [kube_pod_container_resource_requests](https://github.com/kubernetes/kube-state-metrics/blob/main/docs/pod-metrics.md) is used to calculate the total RAM requests of the selected pods or containers" + }, + "plugin": { + "kind": "GaugeChart", + "spec": { + "calculation": "mean", + "format": { + "unit": "bytes" + }, + "max": 137438953472, + "thresholds": { + "steps": [ + { + "color": "#badff4", + "value": 0 + }, + { + "color": "#82b5d8", + "value": 1000000000 + }, + { + "color": "#65c5db", + "value": 10000000000 + } + ] + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "sum(kube_pod_container_resource_requests{namespace=~\"$namespace\",resource=\"memory\",container=~\"$container\",pod=~\"$pod\"})", + "seriesNameFormat": "{{instance}}" + } + } + } + } + ] + } + }, + "1_0": { + "kind": "Panel", + "spec": { + "display": { + "name": "CPU usage and throttling ratio", + "description": "# used formulas\n* [container_cpu_usage_seconds_total](https://github.com/google/cadvisor/blob/master/docs/storage/prometheus.md) to calculate the CPU usage in % based on the cumulative CPU time consumed. 
If multiple threads are used their times are added to the sum\n* [container_cpu_cfs_throttled_periods_total](https://github.com/google/cadvisor/blob/master/docs/storage/prometheus.md) to calculate the CPU throttling in % because of defined limits" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "position": "bottom" + }, + "visual": { + "areaOpacity": 0.3, + "connectNulls": false, + "stack": "all" + }, + "yAxis": { + "show": true, + "label": "", + "format": { + "unit": "percent" + } + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "sum by (pod, container) (rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\",container=~\"$container\", pod=~\"$pod\"}[5m]))", + "seriesNameFormat": "cpu usage | {{pod}}/{{container}}" + } + } + } + }, + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "sum by (pod, container) (rate(container_cpu_cfs_throttled_periods_total{namespace=~\"$namespace\",pod=~\"$pod\", container=~\"$container\"}[5m]) / rate(container_cpu_cfs_periods_total[5m]))", + "seriesNameFormat": "cpu throttling | {{pod}}/{{container}}" + } + } + } + } + ] + } + }, + "1_1": { + "kind": "Panel", + "spec": { + "display": { + "name": "CPU usage and throttling periods", + "description": "# used formulas\n* [container_cpu_cfs_periods_total](https://github.com/google/cadvisor/blob/master/docs/storage/prometheus.md) is used to count the amount of CPU request periods that have triggered limit checks\n* [container_cpu_cfs_throttled_periods_total](https://github.com/google/cadvisor/blob/master/docs/storage/prometheus.md) is used to count the amount of throttled CPU request periods because of limit checks" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "position": "bottom" + }, + "visual": { + "areaOpacity": 0.3, 
+ "stack": "all" + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "round(rate(container_cpu_cfs_periods_total{container=~\"$container\",pod=~\"$pod\"}[5m]), 0.01)", + "seriesNameFormat": "periods | {{pod}}/{{container}}" + } + } + } + }, + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "round(rate(container_cpu_cfs_throttled_periods_total{namespace=~\"$namespace\",container=~\"$container\", pod=~\"$pod\"}[5m]), 0.01)", + "seriesNameFormat": "throttled periods | {{pod}}/{{container}}" + } + } + } + } + ] + } + }, + "2_0": { + "kind": "Panel", + "spec": { + "display": { + "name": "Memory usage", + "description": "# used formulas\n* [container_memory_working_set_bytes](https://github.com/google/cadvisor/blob/master/docs/storage/prometheus.md) to calculate the RAM usage of the container" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "position": "bottom" + }, + "visual": { + "areaOpacity": 0.3, + "stack": "all" + }, + "yAxis": { + "show": true, + "label": "", + "format": { + "unit": "bytes" + } + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "sum(container_memory_working_set_bytes{namespace=~\"$namespace\",container=~\"$container\",pod=~\"$pod\"}) by (namespace, pod, container)", + "seriesNameFormat": "{{pod}}/{{container}}" + } + } + } + } + ] + } + }, + "2_1": { + "kind": "Panel", + "spec": { + "display": { + "name": "Memory usage to requests/limits ratio", + "description": "# used formulas\n* [container_memory_saturation_ratio](https://github.com/sapcc/helm-charts/blob/f1c6d7fe8c9093b16e73d292e2454816a192ec22/prometheus-rules/metrics-regional-rules/templates/aggregations/collector/_resource.rules.tpl#L26) is used to 
calculate the used memory to configured limits ratio\n* [container_memory_utilization_ratio](https://github.com/sapcc/helm-charts/blob/f1c6d7fe8c9093b16e73d292e2454816a192ec22/prometheus-rules/metrics-regional-rules/templates/aggregations/collector/_resource.rules.tpl#L32) is used to calculate the used memory to configured requests ratio" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "position": "bottom" + }, + "visual": { + "areaOpacity": 0.3, + "stack": "all" + }, + "yAxis": { + "show": true, + "label": "", + "format": { + "unit": "percent" + } + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "sum(container_memory_working_set_bytes) by (namespace, pod, container)\n/\nsum(kube_pod_container_resource_limits{resource=\"memory\"}) by (namespace, pod, container)", + "seriesNameFormat": "used limits | {{pod}}/{{container}}" + } + } + } + }, + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "sum(container_memory_working_set_bytes) by (namespace, pod, container)\n/\nsum(kube_pod_container_resource_requests{resource=\"memory\"}) by (namespace, pod, container)", + "seriesNameFormat": "used request | {{pod}}/{{container}}" + } + } + } + } + ] + } + }, + "RamLimits": { + "kind": "Panel", + "spec": { + "display": { + "name": "RAM limits", + "description": "# used formulas\n* [kube_pod_container_resource_limits](https://github.com/kubernetes/kube-state-metrics/blob/main/docs/pod-metrics.md) is used to calculate the total configured RAM limits of the selected pods or containers" + }, + "plugin": { + "kind": "GaugeChart", + "spec": { + "calculation": "mean", + "format": { + "unit": "bytes" + }, + "max": 137438953472, + "thresholds": { + "steps": [ + { + "color": "#00baff", + "value": 1000000000 + }, + { + "color": "#4aa7ff", + "value": 10000000000 + 
} + ] + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "sum(kube_pod_container_resource_limits{namespace=~\"$namespace\",resource=\"memory\",container=~\"$container\", pod=~\"$pod\"})", + "seriesNameFormat": "{{instance}}" + } + } + } + } + ] + } + } + }, + "layouts": [ + { + "kind": "Grid", + "spec": { + "display": { + "title": "Summary", + "collapse": { + "open": true + } + }, + "items": [ + { + "x": 0, + "y": 6, + "width": 5, + "height": 6, + "content": { + "$ref": "#/spec/panels/0_0" + } + }, + { + "x": 0, + "y": 0, + "width": 5, + "height": 6, + "content": { + "$ref": "#/spec/panels/0_2" + } + }, + { + "x": 16, + "y": 0, + "width": 4, + "height": 6, + "content": { + "$ref": "#/spec/panels/0_7" + } + }, + { + "x": 20, + "y": 6, + "width": 4, + "height": 6, + "content": { + "$ref": "#/spec/panels/0_6" + } + }, + { + "x": 5, + "y": 6, + "width": 5, + "height": 6, + "content": { + "$ref": "#/spec/panels/0_1" + } + }, + { + "x": 5, + "y": 0, + "width": 5, + "height": 6, + "content": { + "$ref": "#/spec/panels/0_3" + } + }, + { + "x": 16, + "y": 6, + "width": 4, + "height": 6, + "content": { + "$ref": "#/spec/panels/0_5" + } + }, + { + "x": 11, + "y": 0, + "width": 4, + "height": 6, + "content": { + "$ref": "#/spec/panels/0_4" + } + }, + { + "x": 20, + "y": 0, + "width": 4, + "height": 6, + "content": { + "$ref": "#/spec/panels/RamLimits" + } + } + ] + } + }, + { + "kind": "Grid", + "spec": { + "display": { + "title": "CPU", + "collapse": { + "open": true + } + }, + "items": [ + { + "x": 0, + "y": 0, + "width": 12, + "height": 12, + "content": { + "$ref": "#/spec/panels/1_0" + } + }, + { + "x": 12, + "y": 0, + "width": 12, + "height": 12, + "content": { + "$ref": "#/spec/panels/1_1" + } + } + ] + } + }, + { + "kind": "Grid", + "spec": { + "display": { + "title": "RAM", + "collapse": { + "open": true + } + }, + "items": [ + { + "x": 0, + "y": 0, + "width": 12, + 
"height": 10, + "content": { + "$ref": "#/spec/panels/2_0" + } + }, + { + "x": 12, + "y": 0, + "width": 12, + "height": 10, + "content": { + "$ref": "#/spec/panels/2_1" + } + } + ] + } + } + ], + "variables": [ + { + "kind": "ListVariable", + "spec": { + "display": { + "hidden": false + }, + "defaultValue": "$__all", + "allowAllValue": true, + "allowMultiple": true, + "sort": "alphabetical-asc", + "plugin": { + "kind": "PrometheusLabelValuesVariable", + "spec": { + "labelName": "namespace", + "matchers": [] + } + }, + "name": "namespace" + } + }, + { + "kind": "ListVariable", + "spec": { + "display": { + "hidden": false + }, + "defaultValue": "$__all", + "allowAllValue": true, + "allowMultiple": true, + "sort": "alphabetical-asc", + "plugin": { + "kind": "PrometheusLabelValuesVariable", + "spec": { + "labelName": "pod", + "matchers": [ + "container_memory_working_set_bytes{namespace=~\"$namespace\"}" + ] + } + }, + "name": "pod" + } + }, + { + "kind": "ListVariable", + "spec": { + "display": { + "hidden": false + }, + "defaultValue": "$__all", + "allowAllValue": true, + "allowMultiple": true, + "customAllValue": ".*", + "sort": "alphabetical-asc", + "plugin": { + "kind": "PrometheusLabelValuesVariable", + "spec": { + "labelName": "container", + "matchers": ["container_memory_working_set_bytes{pod=~\"$pod\"}"] + } + }, + "name": "container" + } + } + ], + "duration": "1h", + "refreshInterval": "1m" + } +} diff --git a/charts/kubernetes-operations/perses-dashboards/kubernetes-node.json b/charts/kubernetes-operations/perses-dashboards/kubernetes-node.json new file mode 100644 index 0000000..b544645 --- /dev/null +++ b/charts/kubernetes-operations/perses-dashboards/kubernetes-node.json @@ -0,0 +1,882 @@ +{ + "kind": "Dashboard", + "metadata": { + "name": "kubernetes-node", + "project": "default" + }, + "spec": { + "display": { + "name": "Kubernetes Node" + }, + "panels": { + "1": { + "kind": "Panel", + "spec": { + "display": { + "name": "Disk Total" + }, + "plugin": { 
+ "kind": "StatChart", + "spec": { + "calculation": "last-number", + "format": { + "unit": "bytes" + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "sum(node_filesystem_size_bytes{node=~\"$server\", device=~\"/dev/.*\"})", + "seriesNameFormat": "" + } + } + } + } + ] + } + }, + "2": { + "kind": "Panel", + "spec": { + "display": { + "name": "Total Memory" + }, + "plugin": { + "kind": "StatChart", + "spec": { + "calculation": "last-number", + "format": { + "unit": "bytes" + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "node_memory_MemTotal_bytes{node=~\"$server\"}", + "seriesNameFormat": "" + } + } + } + } + ] + } + }, + "3": { + "kind": "Panel", + "spec": { + "display": { + "name": "CPU Used" + }, + "plugin": { + "kind": "StatChart", + "spec": { + "calculation": "last-number", + "format": { + "unit": "percent" + }, + "sparkline": {} + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "100 - (avg by (node) (irate(node_cpu_seconds_total{node=~\"$server\",mode=\"idle\"}[5m])) * 100)", + "seriesNameFormat": "" + } + } + } + } + ] + } + }, + "4": { + "kind": "Panel", + "spec": { + "display": { + "name": "Uptime" + }, + "plugin": { + "kind": "StatChart", + "spec": { + "calculation": "last-number", + "format": { + "unit": "seconds" + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "time() - node_boot_time_seconds{node=~\"$server\"}", + "seriesNameFormat": "" + } + } + } + } + ] + } + }, + "5": { + "kind": "Panel", + "spec": { + "display": { + "name": "Scrape CPU use" + }, + "plugin": { + "kind": "StatChart", + "spec": { + "calculation": "last-number", + "format": { + 
"unit": "percent-decimal" + }, + "sparkline": {} + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "sum(irate(node_scrape_collector_duration_seconds{node=~\"$server\"}[5m]))", + "seriesNameFormat": "" + } + } + } + } + ] + } + }, + "6": { + "kind": "Panel", + "spec": { + "display": { + "name": "Available memory" + }, + "plugin": { + "kind": "GaugeChart", + "spec": { + "calculation": "last-number", + "format": { + "unit": "percent" + }, + "thresholds": { + "steps": [ + { + "value": 50 + }, + { + "color": "#ff4a4a", + "value": 80 + } + ] + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "(node_memory_MemFree_bytes{node=~\"$server\"} / node_memory_MemTotal_bytes{node=~\"$server\"}) * 100", + "seriesNameFormat": "" + } + } + } + } + ] + } + }, + "7": { + "kind": "Panel", + "spec": { + "display": { + "name": "Memory usage" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "position": "bottom" + }, + "visual": { + "areaOpacity": 0.3, + "stack": "all" + }, + "yAxis": { + "show": true, + "label": "", + "format": { + "unit": "bytes" + } + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "node_memory_MemTotal_bytes{node=~\"$server\"} - node_memory_MemFree_bytes{node=~\"$server\"} - node_memory_Cached_bytes{node=~\"$server\"} - node_memory_Buffers_bytes{node=~\"$server\"} - node_memory_Slab_bytes{node=~\"$server\"}", + "seriesNameFormat": "Used" + } + } + } + }, + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "node_memory_Buffers_bytes{node=~\"$server\"}", + "seriesNameFormat": "Buffers" + } + } + } + }, + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": 
"PrometheusTimeSeriesQuery", + "spec": { + "query": "node_memory_Cached_bytes{node=~\"$server\"} + node_memory_Slab_bytes{node=~\"$server\"}", + "seriesNameFormat": "Cached" + } + } + } + }, + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "node_memory_MemFree_bytes{node=~\"$server\"}", + "seriesNameFormat": "Free" + } + } + } + } + ] + } + }, + "8": { + "kind": "Panel", + "spec": { + "display": { + "name": "Pods Memory Usage" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "position": "bottom" + }, + "visual": { + "areaOpacity": 0.3, + "stack": "all" + }, + "yAxis": { + "show": true, + "label": "", + "format": { + "unit": "bytes" + } + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "topk(5, container_memory_usage_bytes{pod=~\".+\"} AND on(pod) (kube_pod_info{node=\"$server\"}))", + "seriesNameFormat": "{{ namespace }}/{{ pod }}" + } + } + } + } + ] + } + }, + "9": { + "kind": "Panel", + "spec": { + "display": { + "name": "Free Filesystem Space (Lowest)" + }, + "plugin": { + "kind": "GaugeChart", + "spec": { + "calculation": "last-number", + "format": { + "unit": "percent-decimal" + }, + "thresholds": { + "steps": [ + { + "value": 0.8 + }, + { + "color": "#ff4a4a", + "value": 0.9 + } + ] + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "min(node_filesystem_free_bytes{fstype=~\"xfs|ext4\",node=~\"$server\"} / node_filesystem_size_bytes{fstype=~\"xfs|ext4\",node=~\"$server\"})", + "seriesNameFormat": "{{instance}}" + } + } + } + } + ] + } + }, + "10": { + "kind": "Panel", + "spec": { + "display": { + "name": "CPU Usage" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "position": "bottom" + }, + "visual": { + "areaOpacity": 0.3, 
+ "stack": "all" + }, + "yAxis": { + "show": true, + "label": "", + "format": { + "unit": "percent-decimal" + } + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "sum by(mode)(irate(node_cpu_seconds_total{node=~\"$server\", mode!=\"idle\"}[5m])) > 0", + "seriesNameFormat": "{{ mode }}" + } + } + } + } + ] + } + }, + "11": { + "kind": "Panel", + "spec": { + "display": { + "name": "Pods CPU Usage" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "position": "bottom" + }, + "visual": { + "areaOpacity": 0.3, + "stack": "all" + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "topk(5, sum(rate(container_cpu_usage_seconds_total{pod=~\".+\"}[5m])) by (pod, namespace) AND on (pod, namespace) (kube_pod_info{node=\"$server\"}))", + "seriesNameFormat": "{{ namespace }}/{{ pod }}" + } + } + } + } + ] + } + }, + "12": { + "kind": "Panel", + "spec": { + "display": { + "name": "Network Usage" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "position": "bottom" + }, + "visual": { + "areaOpacity": 0.3, + "connectNulls": false + }, + "yAxis": { + "show": true, + "label": "", + "format": { + "unit": "bytes/sec" + } + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "irate(node_network_transmit_bytes_total{node=~\"$server\", device!~\"lo|cbr[0-9]|veth.*\"}[5m]) > 0", + "seriesNameFormat": "{{device}} outbound" + } + } + } + }, + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "irate(node_network_receive_bytes_total{node=~\"$server\", device!~\"lo|cbr[0-9]|veth.*\"}[5m]) > 0", + "seriesNameFormat": "{{device}} inbound" + } + } + } + } + ] + } + }, + "13": 
{ + "kind": "Panel", + "spec": { + "display": { + "name": "IOPs" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "position": "bottom" + }, + "visual": { + "areaOpacity": 0.3, + "display": "line", + "stack": "all" + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "sum by (node) (irate(node_disk_reads_completed_total{node=~\"$server\"}[5m]))", + "seriesNameFormat": "reads per second" + } + } + } + }, + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "sum by (node) (irate(node_disk_writes_completed_total{node=~\"$server\"}[5m]))", + "seriesNameFormat": "writes per second" + } + } + } + }, + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "sum by (node) (irate(node_disk_io_time_seconds_total{node=~\"$server\"}[5m]))", + "seriesNameFormat": "io time" + } + } + } + }, + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "sum by (node) (irate(node_disk_reads_completed_total{node=~\"$server\"}[5m])) + sum by (node) (irate(node_disk_writes_completed_total{node=~\"$server\"}[5m]))", + "seriesNameFormat": "IOPS" + } + } + } + } + ] + } + }, + "14": { + "kind": "Panel", + "spec": { + "display": { + "name": "IO Speed by Device" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "position": "bottom" + }, + "visual": { + "areaOpacity": 0.3, + "stack": "all" + }, + "yAxis": { + "show": true, + "label": "", + "format": { + "unit": "milliseconds" + } + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "rate(node_disk_io_time_seconds_total{node=~\"$server\"}[5m])", + "seriesNameFormat": "{{ device }}" + } + } + } + } + ] + } + }, + 
"15": { + "kind": "Panel", + "spec": { + "display": { + "name": "NFS requests by method" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "position": "bottom" + }, + "visual": {} + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "sum by (method) (rate(node_nfsd_requests_total{node=~\"$server\"}[5m]))", + "seriesNameFormat": "{{ method }}" + } + } + } + } + ] + } + }, + "16": { + "kind": "Panel", + "spec": { + "display": { + "name": "NFS RPC Errors" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "position": "bottom" + }, + "visual": { + "areaOpacity": 0.3, + "stack": "all" + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "minStep": "", + "query": "sum by (error) (rate(node_nfsd_rpc_errors_total[5m]))", + "seriesNameFormat": "{{ error }}" + } + } + } + } + ] + } + } + }, + "layouts": [ + { + "kind": "Grid", + "spec": { + "items": [ + { + "x": 4, + "y": 0, + "width": 4, + "height": 6, + "content": { + "$ref": "#/spec/panels/1" + } + }, + { + "x": 8, + "y": 0, + "width": 4, + "height": 6, + "content": { + "$ref": "#/spec/panels/2" + } + }, + { + "x": 12, + "y": 0, + "width": 4, + "height": 6, + "content": { + "$ref": "#/spec/panels/3" + } + }, + { + "x": 16, + "y": 0, + "width": 4, + "height": 6, + "content": { + "$ref": "#/spec/panels/4" + } + }, + { + "x": 20, + "y": 0, + "width": 4, + "height": 6, + "content": { + "$ref": "#/spec/panels/5" + } + }, + { + "x": 0, + "y": 0, + "width": 4, + "height": 6, + "content": { + "$ref": "#/spec/panels/6" + } + }, + { + "x": 0, + "y": 6, + "width": 4, + "height": 9, + "content": { + "$ref": "#/spec/panels/9" + } + }, + { + "x": 4, + "y": 6, + "width": 6, + "height": 9, + "content": { + "$ref": "#/spec/panels/7" + } + }, + { + "x": 10, + "y": 6, + "width": 14, + "height": 11, 
+ "content": { + "$ref": "#/spec/panels/8" + } + }, + { + "x": 0, + "y": 15, + "width": 10, + "height": 12, + "content": { + "$ref": "#/spec/panels/10" + } + }, + { + "x": 10, + "y": 17, + "width": 14, + "height": 10, + "content": { + "$ref": "#/spec/panels/11" + } + }, + { + "x": 0, + "y": 27, + "width": 24, + "height": 10, + "content": { + "$ref": "#/spec/panels/12" + } + }, + { + "x": 0, + "y": 37, + "width": 12, + "height": 8, + "content": { + "$ref": "#/spec/panels/13" + } + }, + { + "x": 12, + "y": 37, + "width": 12, + "height": 8, + "content": { + "$ref": "#/spec/panels/14" + } + }, + { + "x": 0, + "y": 45, + "width": 12, + "height": 8, + "content": { + "$ref": "#/spec/panels/15" + } + }, + { + "x": 12, + "y": 45, + "width": 12, + "height": 8, + "content": { + "$ref": "#/spec/panels/16" + } + } + ] + } + } + ], + "variables": [ + { + "kind": "ListVariable", + "spec": { + "display": { + "name": "Prometheus Instance", + "hidden": false + }, + "defaultValue": "kube-monitoring-obs-eu-de-1-prometheus", + "allowAllValue": false, + "allowMultiple": false, + "sort": "alphabetical-asc", + "plugin": { + "kind": "PrometheusLabelValuesVariable", + "spec": { + "labelName": "job", + "matchers": ["prometheus_build_info"] + } + }, + "name": "prometheus-instance" + } + }, + { + "kind": "ListVariable", + "spec": { + "display": { + "name": "Node", + "hidden": false + }, + "defaultValue": "shoot--greenhouse--obs-eu-de-1-worker-o4x4e-z1-5665c-kj22d", + "allowAllValue": false, + "allowMultiple": false, + "sort": "alphabetical-asc", + "plugin": { + "kind": "PrometheusLabelValuesVariable", + "spec": { + "labelName": "node", + "matchers": ["kube_node_info"] + } + }, + "name": "server" + } + } + ], + "duration": "6h", + "refreshInterval": "1m" + } +} diff --git a/charts/kubernetes-operations/perses-dashboards/project.json b/charts/kubernetes-operations/perses-dashboards/project.json new file mode 100644 index 0000000..3f9af8e --- /dev/null +++ 
b/charts/kubernetes-operations/perses-dashboards/project.json @@ -0,0 +1,7 @@ +{ + "kind": "Project", + "metadata": { + "name": "default" + }, + "spec": { "display": { "name": "default" } } +} diff --git a/charts/kubernetes-operations/templates/_helpers.tpl b/charts/kubernetes-operations/templates/_helpers.tpl index 61d416f..3b8e8e6 100644 --- a/charts/kubernetes-operations/templates/_helpers.tpl +++ b/charts/kubernetes-operations/templates/_helpers.tpl @@ -39,3 +39,14 @@ plugin: {{ $root.Release.Name }} {{- end }} {{- end }} {{- end }} + +{{- define "kubernetes-operations.persesDashboardSelectorLabels" }} +{{- $path := index . 0 -}} +{{- $root := index . 1 -}} +plugin: {{ $root.Release.Name }} +{{- if $root.Values.dashboards.persesSelectors }} +{{- range $i, $target := $root.Values.dashboards.persesSelectors }} +{{ $target.name | required (printf "$.Values.dashboards.persesSelectors.[%v].name missing" $i) }}: {{ tpl ($target.value | required (printf "$.Values.dashboards.persesSelectors.[%v].value missing" $i)) $root }} +{{- end }} +{{- end }} +{{- end }} diff --git a/charts/kubernetes-operations/templates/perses-dashboards.yaml b/charts/kubernetes-operations/templates/perses-dashboards.yaml new file mode 100644 index 0000000..c4f5409 --- /dev/null +++ b/charts/kubernetes-operations/templates/perses-dashboards.yaml @@ -0,0 +1,16 @@ +{{- if .Values.dashboards.create }} +{{ $root := . 
}} +{{- range $path, $bytes := .Files.Glob "perses-dashboards/*.json" }} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ printf "%s-%s" $root.Release.Name ($path | replace ".json" "" | replace "/" "-" | trunc 63) }} + labels: +{{ include "kubernetes-operations.persesDashboardSelectorLabels" (list $path $root) | indent 4 }} +{{ include "kubernetes-operations.labels" (list $path $root) | indent 4 }} +data: +{{ printf "%s: |-" ($path | replace "/" "-" | indent 2) }} +{{ printf "%s" $bytes | indent 4 }} +{{- end }} +{{- end }} \ No newline at end of file diff --git a/charts/kubernetes-operations/values.yaml b/charts/kubernetes-operations/values.yaml index 15cbae3..514d490 100644 --- a/charts/kubernetes-operations/values.yaml +++ b/charts/kubernetes-operations/values.yaml @@ -54,3 +54,8 @@ dashboards: plutonoSelectors: - name: plutono-dashboard value: '"true"' + + # -- Label selectors for the Perses dashboards to be picked up by Perses. + persesSelectors: + - name: perses.dev/resource + value: '"true"'