Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: flux alerts + optional alerting to Slack #169

Merged
merged 63 commits into from
Mar 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
63 commits
Select commit Hold shift + click to select a range
ca60eb3
test metric push
enjenjenje Mar 4, 2025
85e6951
test metric push
enjenjenje Mar 4, 2025
bd7ef8c
test metric push
enjenjenje Mar 4, 2025
57a2c5b
test metric push
enjenjenje Mar 4, 2025
167178f
lol kek
enjenjenje Mar 4, 2025
5ad2ff6
lol kek
enjenjenje Mar 4, 2025
71927cd
lol kek
enjenjenje Mar 4, 2025
0bd12a4
lol kek
enjenjenje Mar 4, 2025
7b466fc
lol kek
enjenjenje Mar 4, 2025
a7fe80a
lol kek
enjenjenje Mar 4, 2025
3a3a802
lol kek
enjenjenje Mar 4, 2025
b744a22
lol kek
enjenjenje Mar 4, 2025
376ce2f
lol kek
enjenjenje Mar 4, 2025
34ee67d
lol kek
enjenjenje Mar 4, 2025
2102036
lol kek
enjenjenje Mar 5, 2025
1882137
lol kek
enjenjenje Mar 5, 2025
b830613
lol kek
enjenjenje Mar 5, 2025
6fba225
lol kek
enjenjenje Mar 5, 2025
60d86b7
lolkek
enjenjenje Mar 5, 2025
7dd5a67
lolkek
enjenjenje Mar 5, 2025
a160909
lolkek
enjenjenje Mar 5, 2025
8a09600
lolkek
enjenjenje Mar 5, 2025
0ea173c
lolkek
enjenjenje Mar 5, 2025
1139f60
lolkek
enjenjenje Mar 5, 2025
215ad49
lolkek
enjenjenje Mar 5, 2025
ed23310
lolkek
enjenjenje Mar 5, 2025
0ae0bd6
lolkek
enjenjenje Mar 5, 2025
8ef7ea1
lolkek
enjenjenje Mar 5, 2025
87b709c
lolkek
enjenjenje Mar 5, 2025
ee45619
lolkek
enjenjenje Mar 5, 2025
5f7434b
lolkek
enjenjenje Mar 5, 2025
0555146
lolkek
enjenjenje Mar 5, 2025
d25fb0d
lolkek
enjenjenje Mar 5, 2025
8994a44
lolkek
enjenjenje Mar 5, 2025
3c55933
lolkek
enjenjenje Mar 5, 2025
1a6c580
lolkek
enjenjenje Mar 6, 2025
300047b
lolkek
enjenjenje Mar 6, 2025
06246ef
lolkek
enjenjenje Mar 6, 2025
a57f533
lolkek
enjenjenje Mar 6, 2025
bcf79da
lolkek
enjenjenje Mar 6, 2025
edc457a
lolkek
enjenjenje Mar 6, 2025
6a6c84a
lolkek
enjenjenje Mar 6, 2025
9ec8c66
lolkek
enjenjenje Mar 6, 2025
2066de6
lolkek
enjenjenje Mar 6, 2025
e1f3ff2
lolkek
enjenjenje Mar 6, 2025
5bfc541
lolkek
enjenjenje Mar 6, 2025
9894bdd
lolkek
enjenjenje Mar 6, 2025
c7fd316
lolkek
enjenjenje Mar 6, 2025
8612be4
lolkek
enjenjenje Mar 6, 2025
8891ba6
lolkek
enjenjenje Mar 6, 2025
01f56a2
lolkek
enjenjenje Mar 6, 2025
b6fae15
lolkek
enjenjenje Mar 6, 2025
1e65c32
lolkek
enjenjenje Mar 6, 2025
bfd7ae3
lolkek
enjenjenje Mar 6, 2025
e44607e
lolkek
enjenjenje Mar 6, 2025
5f07714
lolkek
enjenjenje Mar 6, 2025
a238d06
moved alerts to fluxcd config dir
enjenjenje Mar 10, 2025
b01448e
moved alerts to fluxcd config dir
enjenjenje Mar 10, 2025
bfdd7ac
moved alerts to fluxcd config dir
enjenjenje Mar 10, 2025
4a34734
moved alerts to fluxcd config dir
enjenjenje Mar 10, 2025
d479c52
moved alerts to fluxcd config dir
enjenjenje Mar 10, 2025
0e38ebc
moved alerts to fluxcd config dir
enjenjenje Mar 10, 2025
357676d
moved alerts to fluxcd config dir
enjenjenje Mar 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 110 additions & 0 deletions flux/components/monitoring/configs/flux/alerts.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# PrometheusRule defining alerts on the state of Flux custom resources.
#
# Every rule reads the `gotk_resource_info` metric and filters on its
# `customresource_kind`, `ready` and `suspended` labels. All rules carry
# `service: fluxcd`, which the Alertmanager route in the kube-prometheus-stack
# HelmRelease matches on (`service = "fluxcd"`).
#
# The duration mentioned in each description must stay in sync with the
# rule's `for:` value, otherwise the notification text is misleading.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: flux-system
  labels:
    app.kubernetes.io/part-of: spectrum-monitoring
    app.kubernetes.io/component: flux
spec:
  groups:
    - name: flux-system
      rules:
        # --- Readiness alerts (critical) --------------------------------
        # Fire when a resource has reported ready != "True" for 5 minutes.
        - alert: HelmReleaseNotReady
          expr: gotk_resource_info{customresource_kind="HelmRelease", ready!="True"} > 0
          for: 5m
          labels:
            severity: critical
            service: fluxcd
          annotations:
            summary: "HelmRelease {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not ready"
            description: "HelmRelease {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been in an unready state for more than 5 minutes."

        - alert: GitRepositorySyncFailed
          expr: gotk_resource_info{customresource_kind="GitRepository", ready!="True"} > 0
          for: 5m
          labels:
            severity: critical
            service: fluxcd
          annotations:
            summary: "GitRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} sync failed"
            description: "GitRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has not been successfully synced for more than 5 minutes."

        - alert: KustomizationNotApplied
          expr: gotk_resource_info{customresource_kind="Kustomization", ready!="True"} > 0
          for: 5m
          labels:
            severity: critical
            service: fluxcd
          annotations:
            summary: "Kustomization {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not applied"
            description: "Kustomization {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has not been successfully applied for more than 5 minutes."

        - alert: ImageRepositorySyncFailed
          expr: gotk_resource_info{customresource_kind="ImageRepository", ready!="True"} > 0
          for: 5m
          labels:
            severity: critical
            service: fluxcd
          annotations:
            summary: "ImageRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} sync failed"
            description: "ImageRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has not been successfully synced for more than 5 minutes."

        - alert: HelmChartFailed
          expr: gotk_resource_info{customresource_kind="HelmChart", ready!="True"} > 0
          for: 5m
          labels:
            severity: critical
            service: fluxcd
          annotations:
            summary: "HelmChart {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has failed"
            description: "HelmChart {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has not been ready for more than 5 minutes."

        # --- Suspension alerts (warning) --------------------------------
        # Fire when a resource has reported suspended="true" for 5 minutes.
        - alert: HelmReleaseSuspended
          expr: gotk_resource_info{customresource_kind="HelmRelease", suspended="true"} > 0
          for: 5m
          labels:
            severity: warning
            service: fluxcd
          annotations:
            summary: "HelmRelease {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended"
            description: "HelmRelease {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended."

        - alert: GitRepositorySuspended
          expr: gotk_resource_info{customresource_kind="GitRepository", suspended="true"} > 0
          for: 5m
          labels:
            severity: warning
            service: fluxcd
          annotations:
            summary: "GitRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended"
            description: "GitRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended."

        - alert: KustomizationSuspended
          expr: gotk_resource_info{customresource_kind="Kustomization", suspended="true"} > 0
          for: 5m
          labels:
            severity: warning
            service: fluxcd
          annotations:
            summary: "Kustomization {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended"
            description: "Kustomization {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended."

        - alert: ImageRepositorySuspended
          expr: gotk_resource_info{customresource_kind="ImageRepository", suspended="true"} > 0
          for: 5m
          labels:
            severity: warning
            service: fluxcd
          annotations:
            summary: "ImageRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended"
            description: "ImageRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended."

        - alert: HelmChartSuspended
          expr: gotk_resource_info{customresource_kind="HelmChart", suspended="true"} > 0
          for: 5m
          labels:
            severity: warning
            service: fluxcd
          annotations:
            summary: "HelmChart {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended"
            description: "HelmChart {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended."
1 change: 1 addition & 0 deletions flux/components/monitoring/configs/flux/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ kind: Kustomization
namespace: monitoring
resources:
- podmonitor.yaml
- alerts.yaml
configMapGenerator:
- name: flux-grafana-dashboards
files:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ kind: HelmRelease
metadata:
name: kube-prometheus-stack
spec:
interval: 1h
interval: 5m
chartRef:
kind: OCIRepository
name: kube-prometheus-stack
Expand All @@ -29,18 +29,129 @@ spec:
# https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/values.yaml
values:
alertmanager:
enabled: false
enabled: true
alertmanagerSpec:
priorityClassName: spectrum-monitoring
secrets: [
slack-api-url,
slack-templates
]

# Alertmanager configuration: routing tree, Slack receivers and template paths.
config:
route:
# Alerts are batched into one notification per (alertname, namespace, severity).
group_by:
- alertname
- namespace
- severity
group_wait: 30s
group_interval: 5m
repeat_interval: 7d
# Default receiver. `blackhole` is declared below with no notification
# configs, so alerts that match none of the child routes are not delivered.
receiver: blackhole
routes:
# Everything that is not labelled service=fluxcd, restricted to
# warning/critical severity, goes to the general Slack receiver.
- receiver: 'slack-all'
matchers:
- service != "fluxcd"
- severity =~ "warning|critical"
# Alerts labelled service=fluxcd go to the Flux Slack receiver;
# there is no severity matcher on this route.
- receiver: 'slack-fluxcd'
matchers:
- service = "fluxcd"
receivers:
- name: blackhole
- name: slack-all
slack_configs:
# NOTE(review): "slack_channel_main" is not defined in the inline template
# visible in this hunk; presumably it comes from a *.tmpl file in the
# `slack-templates` secret. Confirm it exists.
- channel: '#{{- template "slack_channel_main" . -}}'
# Webhook URL is read from a file under the mount path of the
# `slack-api-url` secret listed in alertmanagerSpec.secrets.
api_url_file: /etc/alertmanager/secrets/slack-api-url/slack-all
send_resolved: true
title: '{{ template "slack.main.title" . }}'
text: '{{ template "slack.main.text" . }}'
icon_url: https://upload.wikimedia.org/wikipedia/commons/thumb/3/38/Prometheus_software_logo.svg/500px-Prometheus_software_logo.svg.png
- name: slack-fluxcd
slack_configs:
# NOTE(review): "slack_channel_flux" is likewise not defined in the visible
# inline template; presumably supplied by the `slack-templates` secret.
- channel: '#{{- template "slack_channel_flux" . -}}'
# Separate webhook file (`slack-fluxcd` key) within the same secret mount.
api_url_file: /etc/alertmanager/secrets/slack-api-url/slack-fluxcd
send_resolved: true
title: '{{ template "slack.main.title" . }}'
text: '{{ template "slack.main.text" . }}'
icon_url: https://upload.wikimedia.org/wikipedia/commons/thumb/3/38/Prometheus_software_logo.svg/500px-Prometheus_software_logo.svg.png
# Template files are loaded from two locations: the Alertmanager config
# directory and the mount path of the `slack-templates` secret.
templates:
- '/etc/alertmanager/config/*.tmpl'
- '/etc/alertmanager/secrets/slack-templates/*.tmpl'
templateFiles:
template_1.tmpl: |-
{{ define "__main_title" }}
[{{ .Status | toUpper }}
{{- if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{- template "provider" . -}}-{{- template "cluster_name" . -}}-{{- template "ip_address" . -}} {{ with .GroupLabels }} {{- .Values | join " " }} {{- end -}}
{{ end }}

{{ define "__main_body_firing" }}
{{ range .Alerts }}
*Cluster:* {{ template "provider" . }}-{{- template "cluster_name" . -}}-{{- template "ip_address" . -}}
*Summary:* {{ .Annotations.summary }}
*Description:* {{ .Annotations.description }}
*Since:* {{ .StartsAt.Local.Format "02/01/06 15:04 UTC" }}
*Details:*
{{ range .Labels.SortedPairs }}• {{ .Name }}: `{{ .Value }}`
{{ end }}{{ end }}
{{ end }}

{{ define "__main_body_resolved" }}
{{ range .Alerts }}
*Cluster:* {{ template "provider" . }}-{{- template "cluster_name" . -}}-{{- template "ip_address" . -}}
*Message:* {{ if .Annotations.resolved }}{{ .Annotations.resolved }}{{ else }}{{ .Annotations.summary }}{{ end }}
*Description:* {{ if .Annotations.resolved }}{{ .Annotations.resolved }}{{ else }}{{ .Annotations.description }}{{ end }}
*Started at:* {{ .StartsAt.Local.Format "02/01/06 15:04 UTC" }}
*Ended at:* {{ .EndsAt.Local.Format "02/01/06 15:04 UTC" }}
*Details:*
{{ range .Labels.SortedPairs }}• {{ .Name }}: `{{ .Value }}`
{{ end }}{{ end }}
{{ end }}

{{ define "slack.main.title" }}{{ template "__main_title" . }}{{ end }}

{{ define "slack.main.text" }}
{{ if eq (len .Alerts.Firing) 1 -}}
{{ template "__main_body_firing" . }}
{{- else if gt (len .Alerts.Firing) 1 -}}
*Alerts:* {{ template "__main_body_firing" . }}
{{- else -}}
{{ template "__main_body_resolved" . }}
{{- end -}}
{{ end }}

{{ define "slack.main.dashboard" }}{{ template "__main_dashboard" . }}{{ end }}
{{ define "slack.main.link" }}{{ template "__main_link" . }}{{ end }}
{{ define "slack.main.silence" }}{{ template "__main_silence" . }}{{ end }}
{{ define "slack.main.explore" }}{{ template "__main_explore" . }}{{ end }}
customRules:
KubeStateMetricsListErrors:
severity: info
KubeClientCertificateExpiration:
severity: info
KubeControllerManagerDown:
severity: info
KubeSchedulerDown:
severity: info
PrometheusNotConnectedToAlertmanagers:
severity: info
PrometheusDuplicateTimestamps:
severity: info
PrometheusRuleFailures:
severity: info
KubeProxyDown:
severity: info

prometheusOperator:
priorityClassName: spectrum-monitoring
prometheus:
prometheusSpec:
priorityClassName: spectrum-monitoring
retention: 168h
retention: 200h
resources:
requests:
cpu: 200m
memory: 200Mi

ruleSelectorNilUsesHelmValues: false
serviceMonitorNamespaceSelector: {}
serviceMonitorSelector:
matchExpressions:
Expand Down
Loading