diff --git a/flux/components/monitoring/configs/flux/alerts.yaml b/flux/components/monitoring/configs/flux/alerts.yaml
new file mode 100644
index 00000000..77d86981
--- /dev/null
+++ b/flux/components/monitoring/configs/flux/alerts.yaml
@@ -0,0 +1,116 @@
+# Alerting rules for Flux resources, evaluated on the gotk_resource_info
+# metric. Every alert carries the label service=fluxcd.
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: flux-system
+  labels:
+    app.kubernetes.io/part-of: spectrum-monitoring
+    app.kubernetes.io/component: flux
+spec:
+  groups:
+    - name: flux-system
+      rules:
+        # Readiness alerts (critical): the ready label has not been "True" for
+        # the whole `for` window; each description quotes that same window.
+        - alert: HelmReleaseNotReady
+          expr: gotk_resource_info{customresource_kind="HelmRelease", ready!="True"} > 0
+          for: 5m
+          labels:
+            severity: critical
+            service: fluxcd
+          annotations:
+            summary: "HelmRelease {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not ready"
+            description: "HelmRelease {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is in an unready state for more than 5 minutes."
+
+        - alert: GitRepositorySyncFailed
+          expr: gotk_resource_info{customresource_kind="GitRepository", ready!="True"} > 0
+          for: 5m
+          labels:
+            severity: critical
+            service: fluxcd
+          annotations:
+            summary: "GitRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} sync failed"
+            description: "GitRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has not been successfully synced for more than 5 minutes."
+
+        - alert: KustomizationNotApplied
+          expr: gotk_resource_info{customresource_kind="Kustomization", ready!="True"} > 0
+          for: 5m
+          labels:
+            severity: critical
+            service: fluxcd
+          annotations:
+            summary: "Kustomization {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not applied"
+            description: "Kustomization {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not successfully applied for more than 5 minutes."
+
+        - alert: ImageRepositorySyncFailed
+          expr: gotk_resource_info{customresource_kind="ImageRepository", ready!="True"} > 0
+          for: 5m
+          labels:
+            severity: critical
+            service: fluxcd
+          annotations:
+            summary: "ImageRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} sync failed"
+            description: "ImageRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has not been successfully synced for more than 5 minutes."
+
+        - alert: HelmChartFailed
+          expr: gotk_resource_info{customresource_kind="HelmChart", ready!="True"} > 0
+          for: 5m
+          labels:
+            severity: critical
+            service: fluxcd
+          annotations:
+            summary: "HelmChart {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has failed"
+            description: "HelmChart {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not ready for more than 5 minutes."
+
+        # Suspension alerts (warning): the resource has reported
+        # suspended="true" for the whole `for` window.
+        - alert: HelmReleaseSuspended
+          expr: gotk_resource_info{customresource_kind="HelmRelease", suspended="true"} > 0
+          for: 5m
+          labels:
+            severity: warning
+            service: fluxcd
+          annotations:
+            summary: "HelmRelease {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended"
+            description: "HelmRelease {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended."
+
+        - alert: GitRepositorySuspended
+          expr: gotk_resource_info{customresource_kind="GitRepository", suspended="true"} > 0
+          for: 5m
+          labels:
+            severity: warning
+            service: fluxcd
+          annotations:
+            summary: "GitRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended"
+            description: "GitRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended."
+
+        - alert: KustomizationSuspended
+          expr: gotk_resource_info{customresource_kind="Kustomization", suspended="true"} > 0
+          for: 5m
+          labels:
+            severity: warning
+            service: fluxcd
+          annotations:
+            summary: "Kustomization {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended"
+            description: "Kustomization {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended."
+
+        - alert: ImageRepositorySuspended
+          expr: gotk_resource_info{customresource_kind="ImageRepository", suspended="true"} > 0
+          for: 5m
+          labels:
+            severity: warning
+            service: fluxcd
+          annotations:
+            summary: "ImageRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended"
+            description: "ImageRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended."
+
+        - alert: HelmChartSuspended
+          expr: gotk_resource_info{customresource_kind="HelmChart", suspended="true"} > 0
+          for: 5m
+          labels:
+            severity: warning
+            service: fluxcd
+          annotations:
+            summary: "HelmChart {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended"
+            description: "HelmChart {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended."
diff --git a/flux/components/monitoring/configs/flux/kustomization.yaml b/flux/components/monitoring/configs/flux/kustomization.yaml
index 47e9f532..b8adc304 100644
--- a/flux/components/monitoring/configs/flux/kustomization.yaml
+++ b/flux/components/monitoring/configs/flux/kustomization.yaml
@@ -3,6 +3,7 @@ kind: Kustomization
 namespace: monitoring
 resources:
   - podmonitor.yaml
+  - alerts.yaml
 configMapGenerator:
   - name: flux-grafana-dashboards
     files:
diff --git a/flux/components/monitoring/controllers/kube-prometheus-stack/release.yaml b/flux/components/monitoring/controllers/kube-prometheus-stack/release.yaml
index 95090863..9a4fe5e9 100644
--- a/flux/components/monitoring/controllers/kube-prometheus-stack/release.yaml
+++ b/flux/components/monitoring/controllers/kube-prometheus-stack/release.yaml
@@ -3,7 +3,7 @@ kind: HelmRelease
 metadata:
   name: kube-prometheus-stack
 spec:
-  interval: 1h
+  interval: 5m
   chartRef:
     kind: OCIRepository
     name: kube-prometheus-stack
@@ -29,18 +29,139 @@ spec:
   # https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/values.yaml
   values:
     alertmanager:
-      enabled: false
+      enabled: true
+      alertmanagerSpec:
+        priorityClassName: spectrum-monitoring
+        # Secret names; referenced further down through
+        # /etc/alertmanager/secrets/<name>/ paths.
+        secrets:
+          - slack-api-url
+          - slack-templates
+
+      config:
+        route:
+          group_by:
+            - alertname
+            - namespace
+            - severity
+          group_wait: 30s
+          group_interval: 5m
+          repeat_interval: 7d
+          # Default receiver has no integrations: alerts that match none of
+          # the child routes below are not delivered anywhere.
+          receiver: blackhole
+          routes:
+            - receiver: 'slack-all'
+              matchers:
+                - service != "fluxcd"
+                - severity =~ "warning|critical"
+            - receiver: 'slack-fluxcd'
+              matchers:
+                - service = "fluxcd"
+        receivers:
+          - name: blackhole
+          - name: slack-all
+            slack_configs:
+              - channel: '#{{- template "slack_channel_main" . -}}'
+                api_url_file: /etc/alertmanager/secrets/slack-api-url/slack-all
+                send_resolved: true
+                title: '{{ template "slack.main.title" . }}'
+                text: '{{ template "slack.main.text" . }}'
+                icon_url: https://upload.wikimedia.org/wikipedia/commons/thumb/3/38/Prometheus_software_logo.svg/500px-Prometheus_software_logo.svg.png
+          - name: slack-fluxcd
+            slack_configs:
+              - channel: '#{{- template "slack_channel_flux" . -}}'
+                api_url_file: /etc/alertmanager/secrets/slack-api-url/slack-fluxcd
+                send_resolved: true
+                title: '{{ template "slack.main.title" . }}'
+                text: '{{ template "slack.main.text" . }}'
+                icon_url: https://upload.wikimedia.org/wikipedia/commons/thumb/3/38/Prometheus_software_logo.svg/500px-Prometheus_software_logo.svg.png
+        templates:
+          - '/etc/alertmanager/config/*.tmpl'
+          - '/etc/alertmanager/secrets/slack-templates/*.tmpl'
+      # NOTE(review): provider, cluster_name, ip_address, slack_channel_* and
+      # the __main_dashboard/link/silence/explore templates are not defined
+      # here; presumably they come from the slack-templates secret - confirm.
+      templateFiles:
+        template_1.tmpl: |-
+          {{ define "__main_title" }}
+          [{{ .Status | toUpper }}
+          {{- if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{- template "provider" . -}}-{{- template "cluster_name" . -}}-{{- template "ip_address" . -}} {{ with .GroupLabels }} {{- .Values | join " " }} {{- end -}}
+          {{ end }}
+
+          {{ define "__main_body_firing" }}
+          {{ range .Alerts.Firing }}
+          *Cluster:* {{ template "provider" . }}-{{- template "cluster_name" . -}}-{{- template "ip_address" . -}}
+          *Summary:* {{ .Annotations.summary }}
+          *Description:* {{ .Annotations.description }}
+          *Since:* {{ .StartsAt.UTC.Format "02/01/06 15:04 UTC" }}
+          *Details:*
+          {{ range .Labels.SortedPairs }}• {{ .Name }}: `{{ .Value }}`
+          {{ end }}{{ end }}
+          {{ end }}
+
+          {{ define "__main_body_resolved" }}
+          {{ range .Alerts.Resolved }}
+          *Cluster:* {{ template "provider" . }}-{{- template "cluster_name" . -}}-{{- template "ip_address" . -}}
+          *Message:* {{ if .Annotations.resolved }}{{ .Annotations.resolved }}{{ else }}{{ .Annotations.summary }}{{ end }}
+          *Description:* {{ if .Annotations.resolved }}{{ .Annotations.resolved }}{{ else }}{{ .Annotations.description }}{{ end }}
+          *Started at:* {{ .StartsAt.UTC.Format "02/01/06 15:04 UTC" }}
+          *Ended at:* {{ .EndsAt.UTC.Format "02/01/06 15:04 UTC" }}
+          *Details:*
+          {{ range .Labels.SortedPairs }}• {{ .Name }}: `{{ .Value }}`
+          {{ end }}{{ end }}
+          {{ end }}
+
+          {{ define "slack.main.title" }}{{ template "__main_title" . }}{{ end }}
+
+          {{ define "slack.main.text" }}
+          {{ if eq (len .Alerts.Firing) 1 -}}
+          {{ template "__main_body_firing" . }}
+          {{- else if gt (len .Alerts.Firing) 1 -}}
+          *Alerts:* {{ template "__main_body_firing" . }}
+          {{- else -}}
+          {{ template "__main_body_resolved" . }}
+          {{- end -}}
+          {{ end }}
+
+          {{ define "slack.main.dashboard" }}{{ template "__main_dashboard" . }}{{ end }}
+          {{ define "slack.main.link" }}{{ template "__main_link" . }}{{ end }}
+          {{ define "slack.main.silence" }}{{ template "__main_silence" . }}{{ end }}
+          {{ define "slack.main.explore" }}{{ template "__main_explore" . }}{{ end }}
+    # NOTE(review): placed at the top level of `values`, where the chart's
+    # values.yaml declares customRules - confirm the intended nesting.
+    customRules:
+      KubeStateMetricsListErrors:
+        severity: info
+      KubeClientCertificateExpiration:
+        severity: info
+      KubeControllerManagerDown:
+        severity: info
+      KubeSchedulerDown:
+        severity: info
+      PrometheusNotConnectedToAlertmanagers:
+        severity: info
+      PrometheusDuplicateTimestamps:
+        severity: info
+      PrometheusRuleFailures:
+        severity: info
+      KubeProxyDown:
+        severity: info
+
     prometheusOperator:
       priorityClassName: spectrum-monitoring
 
     prometheus:
       prometheusSpec:
         priorityClassName: spectrum-monitoring
-        retention: 168h
+        retention: 200h
         resources:
           requests:
             cpu: 200m
             memory: 200Mi
+        # With false, an unset ruleSelector is not narrowed to the rules
+        # labelled for this Helm release.
+        ruleSelectorNilUsesHelmValues: false
         serviceMonitorNamespaceSelector: {}
         serviceMonitorSelector:
           matchExpressions: