-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy path2_cluster.sh
executable file
·444 lines (384 loc) · 16.3 KB
/
2_cluster.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
#!/bin/bash
# Provision a Talos Kubernetes cluster on Hetzner Cloud: firewalls, load
# balancers, control-plane and worker nodes, etcd bootstrap, and base add-ons.
# Requires: hcloud CLI, talosctl, kubectl, helm, jq; run 1_hcloud_disk_image.sh first.
# Split words on newlines only, so unquoted expansions survive values with spaces.
IFS=$'\n'
# Strict mode: exit on errors (errexit), inherit ERR traps (errtrace), error on
# unset variables (nounset), fail a pipeline if any stage fails (pipefail).
set +o xtrace -o errexit -o errtrace -o nounset -o pipefail +o history
# Absolute directory of this script, independent of the caller's working directory.
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
# lib.sh supplies the helper functions used below (showNotice/showProgress/showError,
# setContext, getLoadBalancerIps, getNodeIps, openFirewallPorts, waitForTcpPort, ...)
# and the cluster configuration variables (names, selectors, locations, types).
source "${SCRIPT_DIR}/lib.sh"
showNotice "==== Executing $(basename "$0") ===="
# Trace every command from here on, so each provisioning step is visible in the log.
set -o xtrace
setContext
showProgress "Control Firewall"

# Ensure a firewall exists and is applied to servers matching a label selector.
# Idempotent: safe to call on every run.
# Arguments: $1 - firewall name, $2 - label selector to apply the firewall to
# Globals:   CLUSTER_SELECTOR (read) - label put on the firewall itself
ensureFirewall() {
  local fw_name=$1
  local selector=$2
  # -F -x: literal whole-line match — names are data, not regular expressions.
  # (No -q: letting grep drain its input avoids SIGPIPE issues under pipefail.)
  if ! hcloud firewall list --output noheader --output columns=name | grep -Fx "${fw_name}"; then
    hcloud firewall create \
      --name "${fw_name}" \
      --label "${CLUSTER_SELECTOR}"
  fi
  if ! hcloud firewall describe "${fw_name}" -o json | jq -r '.applied_to[].label_selector.selector' | grep -Fx "${selector}"; then
    hcloud firewall apply-to-resource "${fw_name}" \
      --type label_selector \
      --label-selector "${selector}"
  fi
}

ensureFirewall "${CONTROL_FIREWALL_NAME}" "${CONTROL_SELECTOR}"

showProgress "Worker Firewall"
ensureFirewall "${WORKER_FIREWALL_NAME}" "${WORKER_SELECTOR}"
showProgress "Control load balancer"
if ! hcloud load-balancer list --output noheader --output columns=name | grep -Fx "${CONTROL_LB_NAME}"; then
  hcloud load-balancer create \
    --name "${CONTROL_LB_NAME}" \
    --label "${CONTROL_SELECTOR}" \
    --location "${CONTROL_LB_LOCATION}" \
    --type "$( tr '[:upper:]' '[:lower:]' <<< "${CONTROL_LB_TYPE}" )"
fi
# Attach control-plane servers (by label selector) as LB targets, once.
# jq --arg passes shell data as a jq variable instead of splicing it into the program.
TARGET_JSON=$( hcloud load-balancer describe "${CONTROL_LB_NAME}" --output json \
  | jq --arg sel "${CONTROL_SELECTOR}" '.targets[] | select(.label_selector.selector == $sel)' )
if [ -z "${TARGET_JSON}" ]; then
  hcloud load-balancer add-target "${CONTROL_LB_NAME}" \
    --label-selector "${CONTROL_SELECTOR}"
fi
# 6443 = Kubernetes API server, 50000 = Talos apid.
for PORT in 6443 50000; do
  SERVICE_JSON=$( hcloud load-balancer describe "${CONTROL_LB_NAME}" --output json \
    | jq --argjson port "${PORT}" '.services[] | select(.listen_port == $port)' )
  if [ -z "${SERVICE_JSON}" ]; then
    hcloud load-balancer add-service "${CONTROL_LB_NAME}" \
      --listen-port "${PORT}" \
      --destination-port "${PORT}" \
      --protocol tcp
  fi
done
showProgress "Worker load balancer"
if ! hcloud load-balancer list --output noheader --output columns=name | grep -Fx "${WORKER_LB_NAME}"; then
  hcloud load-balancer create \
    --name "${WORKER_LB_NAME}" \
    --label "${WORKER_SELECTOR}" \
    --location "${WORKER_LB_LOCATION}" \
    --type "$( tr '[:upper:]' '[:lower:]' <<< "${WORKER_LB_TYPE}" )"
fi
# Traefik will add targets + services to worker load balancer.
getLoadBalancerIps
showProgress "Generate Talos configs for controlplane and workers"
# Cluster secrets are generated exactly once and reused on re-runs;
# umask 0077 in a subshell keeps the secrets file owner-readable only.
(
  umask 0077
  [ -f "${TALOS_SECRETS}" ] || talosctl gen secrets -o "${TALOS_SECRETS}"
)
showProgress "Generate talosconfig"
# Regenerate the client talosconfig from the shared secrets on every run.
talosctl gen config "${TALOS_CONTEXT}" "https://${CONTROL_LB_IPV4}:6443" \
  --with-secrets "${TALOS_SECRETS}" \
  --output-types talosconfig \
  --output "${TALOSCONFIG}" \
  --force
# Point the client at the control-plane load balancer by default.
talosctl config endpoint "${CONTROL_LB_IPV4}"
talosctl config nodes "${CONTROL_LB_IPV4}"
(
# Merge the freshly generated talosconfig into the user's default talosctl
# config file, replacing any stale context for this cluster.
MERGE_TALOSCONFIG="${TALOSCONFIG}"
# Unset TALOSCONFIG in subshell to run these commands against the default config
# NOTE(review): this assigns an empty string rather than `unset TALOSCONFIG`;
# talosctl appears to treat empty as "use default config" — confirm before changing.
TALOSCONFIG=
# Make sure a default context exists so `config merge` has a file to merge into.
if ! talosctl --context "talos-default" config info 2>/dev/null; then
talosctl config add "talos-default"
fi
talosctl config context talos-default
# Remove any previous context with this cluster's name to avoid a merge conflict.
if talosctl --context "${TALOS_CONTEXT}" config info 2>/dev/null; then
talosctl config remove "${TALOS_CONTEXT}" --noconfirm
fi
talosctl config merge "${MERGE_TALOSCONFIG}"
)
showProgress "Get disk image id"
# Look up the Talos disk image uploaded by 1_hcloud_disk_image.sh via its label selector.
IMAGE_ID="$( hcloud image list --selector "${IMAGE_SELECTOR}" --output noheader --output columns=id | tr -d '\n' )"
if [[ -z "${IMAGE_ID}" ]]; then
  # Disable tracing so the error messages are readable.
  set +o xtrace
  showError "Talos ${TALOS_VERSION} disk image not found at Hetzner Cloud, using selector '${IMAGE_SELECTOR}'."
  showError "Please execute '1_hcloud_disk_image.sh' first."
  exit 1
fi
showProgress "Start control nodes"
for (( NR=0; NR<${#CONTROL_NAMES[@]}; NR++ )); do
  NODE_NAME="${CONTROL_NAMES[${NR}]}"
  CONFIG_FILE="${SCRIPT_DIR}/node_${NODE_NAME}.yaml"
  # Fix: use a genuinely empty array instead of the fragile `( '' )` +
  # unquoted-expansion hack; expanded below with a nounset-safe quoted form.
  CONTROL_EXTRA_OPTS=()
  if [ 0 -eq "${WORKER_COUNT}" ]; then
    # Without workers, the control plane must also schedule regular workloads.
    CONTROL_EXTRA_OPTS=( --config-patch "@${SCRIPT_DIR}/deploy/talos-patch-no-workers.yaml" )
  fi
  # Subshell + umask 0077: the rendered node config contains secrets; keep it owner-only.
  (
    umask 0077
    # ${arr[@]+"${arr[@]}"} expands to nothing for an empty array and stays
    # safe under `set -o nounset` on all bash versions.
    talosctl gen config "${TALOS_CONTEXT}" "https://${CONTROL_LB_IPV4}:6443" \
      --talos-version="${TALOS_VERSION}" \
      --with-secrets="${TALOS_SECRETS}" \
      --with-docs=false \
      --with-examples=false \
      --config-patch "@${SCRIPT_DIR}/deploy/talos-patch.yaml" \
      --config-patch "@${SCRIPT_DIR}/deploy/talos-patch-control.yaml" \
      --config-patch "[
        {
          \"op\": \"replace\",
          \"path\": \"/machine/network/hostname\",
          \"value\": \"${NODE_NAME}\"
        },
        {
          \"op\": \"add\",
          \"path\": \"/machine/nodeLabels/node.kubernetes.io~1instance-type\",
          \"value\": \"${CONTROL_TYPE}\"
        },
        {
          \"op\": \"add\",
          \"path\": \"/machine/nodeLabels/topology.kubernetes.io~1zone\",
          \"value\": \"${CONTROL_LOCATION[${NR}]}\"
        },
        {
          \"op\": \"add\",
          \"path\": \"/machine/nodeLabels/csi.hetzner.cloud~1location\",
          \"value\": \"${CONTROL_LOCATION[${NR}]}\"
        }
      ]" \
      ${CONTROL_EXTRA_OPTS[@]+"${CONTROL_EXTRA_OPTS[@]}"} \
      --kubernetes-version="${KUBE_VERSION}" \
      --additional-sans "${CONTROL_LB_IPV4},${CONTROL_LB_NAME}" \
      --output-types controlplane \
      --output "${CONFIG_FILE}" \
      --force
  )
  # -F -x: literal whole-line name match instead of treating the name as a regex.
  if hcloud server list --output noheader --output columns=name | grep -Fx "${NODE_NAME}"; then
    # Server already exists: push the (possibly updated) config instead of recreating.
    showProgress "Apply config to ${NODE_NAME}"
    NODE_IPV4="$( getNodePublicIpv4 "${NODE_NAME}" )"
    talosctl apply-config \
      --file "${CONFIG_FILE}" \
      --endpoints "${NODE_IPV4}" \
      --nodes "${NODE_IPV4}" || echo "Warning: Apply failed"
    continue
  fi
  hcloud server create \
    --name "${NODE_NAME}" \
    --image "${IMAGE_ID}" \
    --type "${CONTROL_TYPE}" \
    --location "${CONTROL_LOCATION[${NR}]}" \
    --label "${CONTROL_SELECTOR}" \
    --user-data-from-file "${CONFIG_FILE}" # >/dev/null & # Enable if you wish to create in parallel
done
showProgress "Start worker nodes"
for (( NR=0; NR<${#INT_WORKER_NAMES[@]}; NR++ )); do
  NODE_NAME="${INT_WORKER_NAMES[${NR}]}"
  CONFIG_FILE="${SCRIPT_DIR}/node_${NODE_NAME}.yaml"
  # Fix: genuinely empty arrays instead of the fragile `( '' )` +
  # unquoted-expansion hack; expanded below with a nounset-safe quoted form.
  VOLUME_MOUNT=()
  WORKER_EXTRA_OPTS=()
  if [ "${WORKER_DATA_VOLUME}" -gt 0 ]; then
    # One dedicated data volume per worker, created once and auto-mounted.
    VOLUME_NAME="${NODE_NAME}-data"
    if ! hcloud volume list --output noheader --output columns=name | grep -Fx "${VOLUME_NAME}"; then
      hcloud volume create \
        --size "${WORKER_DATA_VOLUME}" \
        --location "${WORKER_LOCATION[${NR}]}" \
        --name "${VOLUME_NAME}" \
        --format xfs
    fi
    VOLUME_MOUNT=( --automount --volume "${VOLUME_NAME}" )
  fi
  # Subshell + umask 0077: the rendered node config contains secrets; keep it owner-only.
  (
    umask 0077
    # ${arr[@]+"${arr[@]}"} expands to nothing for an empty array, safe under nounset.
    talosctl gen config "${TALOS_CONTEXT}" "https://${CONTROL_LB_IPV4}:6443" \
      --talos-version="${TALOS_VERSION}" \
      --with-secrets "${TALOS_SECRETS}" \
      --with-docs=false \
      --with-examples=false \
      --config-patch "@${SCRIPT_DIR}/deploy/talos-patch.yaml" \
      --config-patch "[
        {
          \"op\": \"replace\",
          \"path\": \"/machine/network/hostname\",
          \"value\": \"${NODE_NAME}\"
        },
        {
          \"op\": \"add\",
          \"path\": \"/machine/nodeLabels/node.kubernetes.io~1instance-type\",
          \"value\": \"${WORKER_TYPE}\"
        },
        {
          \"op\": \"add\",
          \"path\": \"/machine/nodeLabels/topology.kubernetes.io~1zone\",
          \"value\": \"${WORKER_LOCATION[${NR}]}\"
        },
        {
          \"op\": \"add\",
          \"path\": \"/machine/nodeLabels/csi.hetzner.cloud~1location\",
          \"value\": \"${WORKER_LOCATION[${NR}]}\"
        }
      ]" \
      ${WORKER_EXTRA_OPTS[@]+"${WORKER_EXTRA_OPTS[@]}"} \
      --kubernetes-version="${KUBE_VERSION}" \
      --additional-sans "${CONTROL_LB_IPV4},${CONTROL_LB_NAME}" \
      --output-types worker \
      --output "${CONFIG_FILE}" \
      --force
  )
  # -F -x: literal whole-line name match instead of treating the name as a regex.
  if hcloud server list --output noheader --output columns=name | grep -Fx "${NODE_NAME}"; then
    # Server already exists: push the (possibly updated) config instead of recreating.
    showProgress "Apply config to ${NODE_NAME}"
    NODE_IPV4="$( getNodePublicIpv4 "${NODE_NAME}" )"
    talosctl apply-config \
      --file "${CONFIG_FILE}" \
      --endpoints "${NODE_IPV4}" \
      --nodes "${NODE_IPV4}" || echo "Warning: Apply failed"
    continue
  fi
  hcloud server create \
    --name "${NODE_NAME}" \
    --image "${IMAGE_ID}" \
    --type "${WORKER_TYPE}" \
    --location "${WORKER_LOCATION[${NR}]}" \
    --label "${WORKER_SELECTOR}" \
    --user-data-from-file "${CONFIG_FILE}" \
    ${VOLUME_MOUNT[@]+"${VOLUME_MOUNT[@]}"} # >/dev/null & # Enable if you wish to create in parallel
done
for NODE_NAME in "${INT_NODE_NAMES[@]}"; do
  showProgress "Wait till ${NODE_NAME} is running"
  # Poll up to 100 times, 10s apart (~17 minutes) for the server status "running".
  TRY=0
  while (( TRY < 100 )); do
    if hcloud server list --output noheader --output columns=name,status | grep -E "^${NODE_NAME}\s+running$"; then
      break
    fi
    # Dump the full server list so the operator can watch provisioning progress.
    hcloud server list
    sleep 10
    TRY=$(( TRY + 1 ))
  done
done
# Populate node IP variables (NODE_IPS_COMMA, CONTROL_IPS, WORKER_IPS_COMMA, ...) from lib.sh.
getNodeIps
# Public IPv4 of the machine running this script, used to open admin ports to the engineer.
# NOTE(review): no error handling — if ifconfig.io is unreachable this ends up empty; confirm.
ENGINEER_IPV4="$( curl --silent --ipv4 ifconfig.io )"
showProgress "Open ports on Control Firewall"
# https://kubernetes.io/docs/reference/networking/ports-and-protocols/#node
# https://www.talos.dev/v1.5/learn-more/talos-network-connectivity/#configuring-network-connectivity
## Traffic from all nodes
# NOTE(review): the first argument is the LB name, not CONTROL_FIREWALL_NAME /
# WORKER_FIREWALL_NAME — verify that openFirewallPorts expects this (defined in lib.sh).
openFirewallPorts "${CONTROL_LB_NAME}" "${NODE_IPS_COMMA}" "udp" 51820 51820 "KubeSpan from all nodes"
openFirewallPorts "${CONTROL_LB_NAME}" "${CONTROL_LB_IPV4},${ENGINEER_IPV4}" "tcp" 6443 6443 "Kubernetes API from Control LB + engineer"
openFirewallPorts "${CONTROL_LB_NAME}" "${CONTROL_LB_IPV4},${ENGINEER_IPV4}" "tcp" 50000 50000 "Talos apid from Control LB + engineer"
openFirewallPorts "${CONTROL_LB_NAME}" "0.0.0.0/0" "icmp" 0 0 "ICMP from everywhere"
showProgress "Open ports on Worker Firewall"
openFirewallPorts "${WORKER_LB_NAME}" "${NODE_IPS_COMMA}" "udp" 51820 51820 "KubeSpan from all nodes"
openFirewallPorts "${WORKER_LB_NAME}" "${WORKER_LB_IPV4}" "tcp" 30000 32767 "NodePorts from Worker LB"
openFirewallPorts "${WORKER_LB_NAME}" "${ENGINEER_IPV4}" "tcp" 50000 50000 "Talos apid from engineer"
openFirewallPorts "${WORKER_LB_NAME}" "0.0.0.0/0" "icmp" 0 0 "ICMP from everywhere"
showProgress "Wait all nodes to open port 50000"
# Talos apid (port 50000) answering means the node booted far enough to accept config/API calls.
for NODE_NAME in "${INT_NODE_NAMES[@]}"; do
_PUBLIC_IPV4="$(getNodePublicIpv4 "${NODE_NAME}")"
waitForTcpPort "${_PUBLIC_IPV4}" 50000
done
showProgress "Bootstrap Talos cluster"
# Bootstrap etcd exactly once: if etcd already answers, the cluster was bootstrapped before.
if ! talosctl etcd status --nodes "${CONTROL_IPS[0]}" --endpoints "${CONTROL_IPS[0]}" 2>/dev/null; then
  talosctl bootstrap --nodes "${CONTROL_IPS[0]}" --endpoints "${CONTROL_IPS[0]}"
fi
showProgress "KubeSpan Peers (from control1)"
# Retry until the KubeSpan peer resources become available.
# Fix: sleep between attempts — the original tight loop hammered the API up to
# 100 times back-to-back with no pause.
for (( TRY=0; TRY<100; TRY++ )); do
  if talosctl --nodes "${CONTROL_IPS[0]}" get kubespanpeerspecs; then
    break
  fi
  sleep 5
done
talosctl --nodes "${CONTROL_IPS[0]}" get kubespanpeerstatuses
showProgress "Update kubeconfig for kubectl"
# Refresh the engineer's personal kubeconfig as well, when one is configured.
if [[ -n "${USER_KUBECONFIG}" ]]; then
  KUBECONFIG="${USER_KUBECONFIG}" talosctl kubeconfig --force --nodes "${CONTROL_IPS[0]}" --endpoints "${CONTROL_IPS[0]}"
fi
# Write the cluster kubeconfig used by the rest of this script.
talosctl kubeconfig --force "${KUBECONFIG}" --nodes "${CONTROL_IPS[0]}" --endpoints "${CONTROL_IPS[0]}"
showProgress "Wait for first control node to become Ready"
waitForTcpPort "${CONTROL_LB_IPV4}" 50000
waitForTcpPort "${CONTROL_LB_IPV4}" 6443
# Poll (up to 100 x 5s) until the first control node reports Ready.
for (( TRY=0; TRY<100; TRY++ )); do
  # Print the node list for visibility; tolerate transient API failures.
  kubectl get nodes || true
  if kubectl get nodes --no-headers "${CONTROL1_NAME}" | grep -E "\sReady\s"; then
    break
  fi
  sleep 5
done
showProgress "Wait for cluster to become healthy"
# talosctl health blocks for up to 60 minutes per attempt; retry a bounded number of times.
ATTEMPT=0
while (( ATTEMPT++ < 100 )); do
  if talosctl health \
    --endpoints "${CONTROL_LB_IPV4}" \
    --nodes "${CONTROL_IPS[0]}" \
    --control-plane-nodes "${CONTROL_IPS_COMMA}" \
    --worker-nodes "${WORKER_IPS_COMMA}" \
    --wait-timeout 60m
  then
    break
  fi
done
showProgress "Patch nodes to add providerID"
for NODE_NAME in "${INT_NODE_NAMES[@]}"; do
  # providerID ties the K8s node to its Hetzner server; Kubernetes allows setting it only once.
  NODE_ID="hcloud://$( hcloud server describe "${NODE_NAME}" -o json | jq -r '.id' )"
  CURRENT_ID="$( kubectl get node "${NODE_NAME}" -o custom-columns=ID:.spec.providerID --no-headers )"
  if [[ "${CURRENT_ID}" == "<none>" ]]; then
    kubectl patch node "${NODE_NAME}" --patch="{ \"spec\": {\"providerID\":\"${NODE_ID}\"} }"
  fi
  # Re-read and verify: a mismatching providerID cannot be corrected afterwards.
  PROVIDER_ID="$( kubectl get node "${NODE_NAME}" -o custom-columns=ID:.spec.providerID --no-headers )"
  if [[ "${NODE_ID}" != "${PROVIDER_ID}" ]]; then
    showError "The providerID of '${NODE_NAME}' in K8S is '${PROVIDER_ID}' while it is '${NODE_ID}' at Hetzner. It is not possible to change this."
    exit 1
  fi
done
showProgress "Patch Worker nodes with roles"
for NODE_NAME in "${WORKER_NAMES[@]}"; do
  # Adding these labels to the Talos machine config is restricted because of security reasons.
  # Fix: --overwrite keeps the script idempotent — without it, kubectl label fails
  # on re-runs once the label already exists, aborting the script under errexit.
  kubectl label node "${NODE_NAME}" --overwrite node-role.kubernetes.io/worker=true
  kubectl label node "${NODE_NAME}" --overwrite node-role.kubernetes.io/storage-node=true
done
showProgress "Create Hetzner Cloud secret"
NAMESPACE="kube-system"
# Create the API-token secret consumed by the cloud controller manager and CSI driver.
if ! kubectl get -n "${NAMESPACE}" secret --no-headers -o name | grep -x "secret/hcloud"; then
  # Consistency fix: use ${NAMESPACE} here as well, instead of a hard-coded namespace.
  kubectl -n "${NAMESPACE}" create secret generic hcloud --from-literal="token=$( getHcloudToken )"
fi
showProgress "Install Hetzner Cloud Controller Manager using Helm"
NAMESPACE="kube-system"
# Pick install vs upgrade depending on whether a release named hccm already exists.
HELM_ACTION="install"
if helm get manifest --namespace "${NAMESPACE}" hccm &>/dev/null; then
  HELM_ACTION="upgrade"
fi
# https://github.com/hetznercloud/hcloud-cloud-controller-manager/blob/main/chart/values.yaml
helm repo add hcloud "https://charts.hetzner.cloud"
helm repo update hcloud
helm "${HELM_ACTION}" hccm hcloud/hcloud-cloud-controller-manager \
  --namespace "${NAMESPACE}" \
  --values "${SCRIPT_DIR}/deploy/hcloud-ccm-values.yaml" \
  --set "env.HCLOUD_LOAD_BALANCERS_LOCATION.value=${DEFAULT_LB_LOCATION}"
showProgress "Install Local Path Storage"
kubectl apply -f "${DEPLOY_DIR}/local-path-storage.yaml"
showProgress "Install Hetzner Cloud Container Storage Interface (CSI) using Helm"
NAMESPACE="kube-system"
# Pick install vs upgrade depending on whether the hcloud-csi release already exists.
HELM_ACTION="install"
if helm get manifest --namespace "${NAMESPACE}" hcloud-csi &>/dev/null; then
  HELM_ACTION="upgrade"
fi
# https://github.com/hetznercloud/csi-driver/tree/main/chart
helm "${HELM_ACTION}" hcloud-csi hcloud/hcloud-csi \
  --namespace "${NAMESPACE}" \
  --values "${DEPLOY_DIR}/hcloud-csi-values.yaml"
showProgress "Patch CSI Nodes to add driver"
for NODE_NAME in "${INT_NODE_NAMES[@]}"; do
# Hetzner server id and location for this node, used to register the CSI driver topology.
NODE_ID="$( hcloud server describe "${NODE_NAME}" -o json | jq -r '.id' )"
LOCATION="$( hcloud server describe "${NODE_NAME}" -o json | jq -r .datacenter.location.name )"
# Label the node with its location so the CSI topology key below can match it.
kubectl patch node "${NODE_NAME}" --patch="{
\"metadata\": {
\"labels\": {
\"csi.hetzner.cloud/location\": \"${LOCATION}\"
}
}
}"
# Register the hcloud CSI driver on the CSINode object.
# NOTE(review): count 16 presumably mirrors Hetzner's per-server volume attach limit — confirm.
kubectl patch csinode "${NODE_NAME}" --patch="{
\"spec\": {
\"drivers\": [
{
\"allocatable\": {
\"count\": 16
},
\"name\": \"csi.hetzner.cloud\",
\"nodeID\": \"${NODE_ID}\",
\"topologyKeys\": [
\"csi.hetzner.cloud/location\"
]
}
]
}
}"
done
showProgress "Show nodes"
# Final overview of the cluster for the operator.
kubectl get nodes -o wide
# Stop command tracing before printing the closing user-facing messages.
set +o xtrace
showWarning "You can now use kubectl, to switch to this environment execute first: source ./env.sh"
showNotice "==== Finished $(basename "$0") ===="