diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f8bb3906..85a6abd6 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -46,7 +46,7 @@ jobs: - name: Check which containers changed id: containers_changed run: | - tasks=$(git diff --name-only origin/main..HEAD -- tasks/ | grep -Ev 'run-local.sh|openssl.cnf|install-service|README|mock-|.yaml' || true) + tasks=$(git diff --name-only origin/main..HEAD -- tasks/container) # print for debugging echo "tasks: $tasks" [ -z "$tasks" ] || echo "tasks=true" >> "$GITHUB_OUTPUT" diff --git a/Makefile b/Makefile index c76257fd..ccc9ca6c 100644 --- a/Makefile +++ b/Makefile @@ -1,35 +1,22 @@ all: @echo "usage: make containers" >&2 - @echo " make tasks-shell" >&2 @echo " make tasks-container" >&2 @echo " make tasks-push" >&2 @echo " make check" >&2 check: - python3 -m pyflakes tasks tasks/webhook - python3 -m pycodestyle --max-line-length=120 --ignore=E722 tasks tasks/webhook + python3 -m pyflakes tasks tasks/container/webhook + python3 -m pycodestyle --max-line-length=120 --ignore=E722 tasks tasks/container/webhook TAG := $(shell date --iso-8601) TASK_SECRETS := /var/lib/cockpit-secrets/tasks -WEBHOOK_SECRETS := /var/lib/cockpit-secrets/webhook -TASK_CACHE := /var/cache/cockpit-tasks DOCKER ?= $(shell which podman docker 2>/dev/null | head -n1) containers: tasks-container @true -tasks-shell: - $(DOCKER) run -ti --rm \ - --shm-size=1024m \ - --volume=$(CURDIR)/tasks:/usr/local/bin \ - --volume=$(TASK_SECRETS):/run/secrets/tasks/:ro \ - --volume=$(WEBHOOK_SECRETS):/run/secrets/webhook/:ro \ - --volume=$(TASK_CACHE):/cache:rw \ - --entrypoint=/bin/bash \ - quay.io/cockpit/tasks -i - tasks-container: - $(DOCKER) build -t quay.io/cockpit/tasks:$(TAG) tasks + $(DOCKER) build -t quay.io/cockpit/tasks:$(TAG) tasks/container $(DOCKER) tag quay.io/cockpit/tasks:$(TAG) quay.io/cockpit/tasks:latest tasks-push: diff --git a/ansible/roles/tasks-systemd/tasks/main.yml 
b/ansible/roles/tasks-systemd/tasks/main.yml index ff55ae0d..2300b8e5 100644 --- a/ansible/roles/tasks-systemd/tasks/main.yml +++ b/ansible/roles/tasks-systemd/tasks/main.yml @@ -96,18 +96,28 @@ '--memory=24g', '--pids-limit=16384', '--shm-size=1024m', + # qcow overlays on tmpfs '--tmpfs=/tmp:size=14g', '--env=TEST_OVERLAY_DIR=/tmp', + + # image cache + '--env=COCKPIT_IMAGES_DATA_DIR=/cache/images', + '--volume=/var/cache/cockpit-tasks/images:/cache/images:rw', + # local image stores '--env=COCKPIT_IMAGE_STORES_FILE=/config/image-stores', '--volume=/var/cache/cockpit-tasks/image-stores:/config/image-stores:ro', + # generic secrets '--volume=/var/lib/cockpit-secrets/tasks/npm-registry.crt:/run/secrets/tasks/npm-registry.crt:ro', + # various configuration '--volume=/etc/npmrc:/etc/npmrc:ro', - '--volume=/var/cache/cockpit-tasks/images:/cache/images:rw', '--env=TEST_JOBS={{ TEST_JOBS | default(8) }}', + # copy git settings from main tasks container + '--env=GIT_COMMITTER_*', + '--env=GIT_AUTHOR_*', ] [container.secrets] diff --git a/tasks/README.md b/tasks/README.md index d81b281a..84e77376 100644 --- a/tasks/README.md +++ b/tasks/README.md @@ -1,22 +1,22 @@ # Cockpit Continuous Integration tasks -This is the container and configuration for the Cockpit integration tests and -automated maintenance tasks. This documentation is for deployment on Fedora -35+, Fedora CoreOS, or RHEL 8+. +This is the [container](./container) and deployment scripts for the Cockpit +integration tests and automated maintenance tasks. 
The container has optional mounts: - * `/secrets`: A directory for tasks specific secrets, with at least the following files: - * `s3-keys/*`: files with S3 access tokens for image upload/download and task log bucket - * `s3-server.{pem,key}`: TLS certificate for local S3 image cache container - * `/run/secrets/webhook`: A directory for secrets shared with the webhook container, with the following files: - * `.config--github-token`: GitHub token to create and update issues and PRs + * A directory for image files. Defined by `$COCKPIT_IMAGES_DATA_DIR` env + variable, conventionally `/cache/images`. On production hosts, this is + mounted from `/var/cache/cockpit-tasks/images`. + * S3 access tokens for image and log buckets. Defined by `$COCKPIT_S3_KEY_DIR` + env variable, conventionally `/run/secrets/s3-keys`. + On production hosts, this is mounted from `/var/lib/cockpit-secrets/tasks/s3-keys`. + * A directory for GitHub and AMQP secrets. Used by both the tasks and the webhook container. + Must be in `/run/secrets/webhook` (bots currently assumes that). + * `.config--github-token`: GitHub token to create and update issues and PRs. * `amqp-{client,server}.{pem,key}`: TLS certificates for RabbitMQ * `ca.pem`: The general cockpit CI Certificate Authority which signed the above AMQP certificates - * `/cache`: A directory for reusable cached data such as downloaded image files - -The mounts normally default to `/var/lib/cockpit-secrets/tasks`, -`/var/lib/cockpit-secrets/webhook`, and `/var/cache/cockpit-tasks` on the host. + On production hosts, this is mounted from `/var/lib/cockpit-secrets/webhook`. To generate the [certificates needed for cross-cluster AMQP](https://www.rabbitmq.com/ssl.html) authentication, run the [credentials/webhook/generate.sh script](./credentials/webhook/generate.sh) script. @@ -28,31 +28,27 @@ Run either script in the target directory (e.g. 
# Deploying/updating on our CI infrastructure This happens through [Ansible](../ansible/) depending on the target cloud. - -Some helpful commands: - - # journalctl -fu cockpit-tasks@* - # systemctl stop cockpit-tasks@* +These tasks containers are controlled by systemd units `cockpit-tasks@*`. # Deploying on OpenShift -The testing machines can run on OpenShift cluster(s), as long as they have -support for `/dev/kvm` in containers. Otherwise they will only be able to -process non-test tasks (such as processing the `statistics` or `webhook` -queues). +OpenShift primarily runs the GitHub webhook responder and AMQP server. -If you run tests, you need a persistent shared volume for locally caching -images. Create it with +As `/dev/kvm` support on OpenShift is hard to come by, the bots +`job-runner` and the deployment resources currently only support a tasks +container which processes the `statistics` and `webhook` queues. + +You need a persistent shared volume for `test-results.db` and the Prometheus +database. Create it with oc create -f tasks/images-claim-centosci.yaml Now create all the remaining kubernetes objects. The secrets are created from -the `/var/lib/cockpit-secrets/tasks` directory as described above. For the -webhook secrets a github token `~/.config/github-webhook-token` should be -present. +the `/var/lib/cockpit-secrets/*` directories as described above: make tasks-secrets | oc create -f - - oc create -f tasks/cockpit-tasks.json + oc create -f tasks/cockpit-tasks-webhook.yaml + oc create -f tasks/cockpit-tasks-centosci.yaml ## Troubleshooting @@ -62,54 +58,59 @@ Some helpful commands: oc describe pods oc log -f cockpit-tasks-xxxx -Service affinity currently wants all the cockpit-tasks pods to be in the same region. 
-If you have your own cluster make sure all the nodes are in the same region: +# Deploying locally for development, integration tests - oc patch node node.example.com -p '{"metadata":{"labels": {"region": "infra"}}}' +For hacking on the webhook, task container, bots infrastructure, or validating +new container images, you can also run a [podman pod](http://docs.podman.io/en/latest/pod.html) +locally with RabbitMQ, webhook, minio S3, and tasks containers. +Without arguments this will run some purely local integration tests: -## Scaling + tasks/run-local.sh -We can scale the number of testing machines in the openshift cluster with this -command: +This will also generate the secrets in a temporary directory, unless they +already exist in `tasks/credentials/`. By default this will use the +[`quay.io/cockpit/tasks:latest`](https://quay.io/repository/cockpit/tasks?tab=tags) +container, but you can run a different tag by setting `$TASKS_TAG`. - oc scale rc cockpit-tasks --replicas=3 +You can also test the whole GitHub → webhook → tasks → GitHub status workflow +on some cockpituous PR by specifying the PR number and a GitHub token: -# Deploying locally for development + tasks/run-local.sh -p 123 -t ~/.config/cockpit-dev/github-token -For hacking on the webhook, image, or task container, or validating new container -images, you can also run a simple [podman pod](http://docs.podman.io/en/latest/pod.html) -locally with RabbitMQ, webhook, images, and tasks containers: +This will run tests-scan/tests-trigger on the given PR and trigger a +[unit-tests](../.cockpit-ci/run) test which simply does `make check`. - $ tasks/run-local.sh +You can get an interactive shell with -This will also generate the secrets in a temporary directory, unless they -already exist in `tasks/credentials/`. By default this will use the -`quay.io/cockpit/{tasks,images}:latest` containers, but you can run a different -tag by setting `$TASKS_TAG` and/or `$IMAGES_TAG`. 
+ tasks/run-local.sh -i -This currently does not yet have any convenient way to inject arbitrary jobs -into the AMQP queue; this will be provided at a later point. However, you can -test the whole GitHub → webhook → tasks → GitHub status workflow on some -cockpituous PR with specifying the PR number and a GitHub token: +to run things manually. For example, use `publish-queue` to inject a job into +AMQP, or run `job-runner` or some bots command. - $ tasks/run-local.sh -p 123 -t ~/.config/github-token +# Running with toolbx -This will run tests-scan/tests-trigger on the given PR and trigger an -[unit-tests](../.cockpit-ci/run) test which simply does `make check`. +This container can also be used for local development with +[toolbx](https://containertoolbx.org/), to get an "official" Cockpit +development environment that's independent from the host: -# Running single container locally +```sh +toolbox create --image quay.io/cockpit/tasks cockpit +toolbox enter cockpit +``` + +# Running single container with production-like resources When you want to debug a problem with a test which may be sensitive to its -particular environment (such as calibrating RAM, /dev/shm sizes, or behaviour -of libvirt in a container, etc.), you can run the tasks container directly with -podman. The production parameters are set in the -[install-service](./install-service) script. You don't need secrets, custom -networks, or most environment settings, the crucial parts are the memory, -device, and image cache configurations. +particular resource configuration (such as calibrating RAM, /dev/shm sizes, or +behaviour of libvirt in a container, etc.), you can run the tasks container +directly with podman. The production parameters are set in the +`job-runner.toml` file in the +[tasks-systemd Ansible role](../ansible/roles/tasks-systemd/tasks/main.yml). +You don't need secrets, custom networks, or most environment settings, the +crucial parts are the memory, device, and image cache configurations. 
-First of all, if you want to share your host's image cache (which is really a -good idea), temporarily make it writable to the unprivileged user in the -container: +If you want to share your host's image cache (which is really a good idea), +temporarily make it writable to the unprivileged user in the container: ```sh chmod o+w ~/.cache/cockpit-images @@ -215,7 +216,7 @@ sequenceDiagram (2) a cockpit/tasks container that runs the actual [webhook](https://github.com/cockpit-project/cockpituous/blob/main/tasks/webhook). - See the [Kubernetes resources](https://github.com/cockpit-project/cockpituous/blob/main/tasks/cockpit-tasks-webhook.yaml) + See the [Kubernetes resources](./cockpit-tasks-webhook.yaml) for details about the route, service, and pod. That webhook is a fairly straightforward piece of Python that routes the @@ -245,23 +246,13 @@ sequenceDiagram * Some cockpit/tasks bot picks up the event payload from the "webhook" queue, and interprets it with [tests-scan](https://github.com/cockpit-project/bots/blob/main/tests-scan) or [issue-scan](https://github.com/cockpit-project/bots/blob/main/issue-scan) - depending on the event type. This results in a shell command like - `tests-invoke [...]`, `npm-update [...]`, or similar. If this involves any - Red Hat internal resources, like RHEL or Windows images, that command gets - put into the "internal" queue, otherwise into the "public" queue. + depending on the event type. This results in a + [job-runner JSON task](https://github.com/cockpit-project/bots/blob/main/job-runner) + or a shell command like `prometheus-stats`, or similar. If this involves any + Red Hat internal resources, like RHEL images, that command gets put into the + "internal" queue, otherwise into the "public" queue. 
- * Some cockpit/tasks bot picks up the shell command from the internal or + * Some cockpit/tasks bot picks up the task from the internal or public queue (depending on whether it has access to Red Hat internal infrastructure), executes it, publishes the log, updates the GitHub status, and finally acks the queue item. - -# Using with toolbx - -This container can also be used for local development with -[toolbx](https://containertoolbx.org/), to get an "official" Cockpit -development environment that's independent from the host: - -```sh -toolbox create --image quay.io/cockpit/tasks cockpit -toolbox enter cockpit -``` diff --git a/tasks/cockpit-tasks-centosci.yaml b/tasks/cockpit-tasks-centosci.yaml index 1fc95ff7..c330db20 100644 --- a/tasks/cockpit-tasks-centosci.yaml +++ b/tasks/cockpit-tasks-centosci.yaml @@ -22,6 +22,16 @@ spec: value: '1' - name: COCKPIT_GITHUB_TOKEN_FILE value: /run/secrets/webhook/.config--github-token + - name: COCKPIT_IMAGES_DATA_DIR + value: /cache/images + - name: GIT_COMMITTER_NAME + value: Cockpituous + - name: GIT_COMMITTER_EMAIL + value: cockpituous@cockpit-project.org + - name: GIT_AUTHOR_NAME + value: Cockpituous + - name: GIT_AUTHOR_EMAIL + value: cockpituous@cockpit-project.org volumeMounts: - name: secrets mountPath: /run/secrets/tasks diff --git a/tasks/Containerfile b/tasks/container/Containerfile similarity index 89% rename from tasks/Containerfile rename to tasks/container/Containerfile index 578c47d2..02b35d9f 100644 --- a/tasks/Containerfile +++ b/tasks/container/Containerfile @@ -69,21 +69,19 @@ RUN dnf -y update && \ dnf clean all && \ pip install ruff -COPY cockpit-tasks install-service webhook github_handler.py /usr/local/bin/ +COPY cockpit-tasks webhook github_handler.py /usr/local/bin/ RUN groupadd -g 1111 -r user && useradd -r -g user -u 1111 user --home-dir /work && \ groupadd -g 1001 -r github && useradd -r --no-create-home -g github -u 1001 github && \ mkdir -p /usr/local/bin /cache/images /cache/github && \ mkdir 
-p /work/.ssh /work/.cache && \ - printf '[user]\n\t\nemail = cockpituous@cockpit-project.org\n\tname = Cockpituous\n[cockpit "bots"]\n\timages-data-dir = /cache/images\n' >/work/.gitconfig && \ chmod g=u /etc/passwd && \ chmod -R ugo+w /cache /work && \ chown -R user:user /cache /work && \ printf '[libdefaults]\ndefault_ccache_name = FILE:/tmp/krb5.ccache\n' > /etc/krb5.conf.d/0_file_ccache && \ echo 'user ALL=NOPASSWD: /usr/bin/chmod 666 /dev/kvm' > /etc/sudoers.d/user-fix-kvm -ENV LANG=C.UTF-8 \ - TEST_OVERLAY_DIR=/tmp +ENV LANG=C.UTF-8 VOLUME /cache/images diff --git a/tasks/cockpit-tasks b/tasks/container/cockpit-tasks similarity index 67% rename from tasks/cockpit-tasks rename to tasks/container/cockpit-tasks index 0780c807..4ff80fe9 100755 --- a/tasks/cockpit-tasks +++ b/tasks/container/cockpit-tasks @@ -9,11 +9,6 @@ COCKPIT_BOTS_BRANCH=${COCKPIT_BOTS_BRANCH:-main} WORKDIR="$PWD" BOTS_DIR="$WORKDIR"/bots -# OpenShift instances with their random user don't set $HOME -if [ -z "${HOME:-}" ] || [ "$HOME" = / ]; then - export HOME=/work -fi - echo "Starting testing" function update_bots() { @@ -45,19 +40,10 @@ for i in $(seq 1 30); do update_bots cd "$BOTS_DIR" - # avoid stale state and sockets - virsh list --id --all | xargs --no-run-if-empty virsh destroy - pkill -9 virtqemud || true - while pgrep virtqemud >/dev/null; do sleep 0.5; done - rm -rf ~/.config/libvirt ~/.cache/libvirt - # run-queue fails on empty queues; don't poll too often timeout 12h ./run-queue ${AMQP_SERVER:+--amqp} ${AMQP_SERVER:-} || slumber - # clean up after tests, in particular large qcow overlays - rm -rf "${TEST_OVERLAY_DIR:-/var/tmp}"/* || true - rm -rf /tmp/.cockpit-test-resources || true done -# Prune old images +# Prune old images on our local cache update_bots ./image-prune diff --git a/tasks/github_handler.py b/tasks/container/github_handler.py similarity index 100% rename from tasks/github_handler.py rename to tasks/container/github_handler.py diff --git a/tasks/webhook 
b/tasks/container/webhook similarity index 100% rename from tasks/webhook rename to tasks/container/webhook diff --git a/tasks/install-service b/tasks/install-service index 0ac7332a..4c651a29 100755 --- a/tasks/install-service +++ b/tasks/install-service @@ -13,8 +13,6 @@ SECRETS=/var/lib/cockpit-secrets CACHE=/var/cache/cockpit-tasks IMAGE_STORES=${CACHE}/image-stores INSTANCES=${INSTANCES:-4} -# assume the host has plenty of RAM, use a tmpfs for /tmp for getting less IO contention; this can be overridden -TMPVOL=${TMPVOL:-"--tmpfs /tmp:size=14g"} systemctl stop 'cockpit-tasks@*.service' @@ -40,41 +38,34 @@ Requires=podman.socket After=podman.socket [Service] -Environment="TEST_JOBS=${TEST_JOBS:-8}" -Environment="TEST_CACHE=$CACHE" -Environment="TEST_SECRETS=$SECRETS" -Environment="TEST_NOTIFICATION_MX=${TEST_NOTIFICATION_MX:-}" -Environment="TEST_NOTIFICATION_TO=${TEST_NOTIFICATION_TO:-}" Restart=always RestartSec=60 # give image pull enough time TimeoutStartSec=10min ExecStartPre=-/usr/bin/podman rm -f cockpit-tasks-%i -ExecStartPre=-/usr/bin/podman network rm cockpit-tasks-%i # HACK: sometimes images get an MCS category which makes them undeletable by the container -ExecStartPre=/usr/bin/chcon -R -l s0 \${TEST_CACHE}/images/ +ExecStartPre=/usr/bin/chcon -R -l s0 ${CACHE}/images/ ExecStartPre=/usr/bin/flock /tmp/cockpit-image-pull podman pull quay.io/cockpit/tasks -ExecStartPre=/usr/bin/podman network create cockpit-tasks-%i +# job-runner doesn't need /images, but we still need it for the run-queue store-tests task +# FIXME: /dev/kvm should be dropped, but fix bots run-queue for that (indication of whether to run kvm tasks) ExecStart=/usr/bin/podman run --name=cockpit-tasks-%i --hostname=${CONTAINER_HOSTNAME} \ - --device=/dev/kvm --network=cockpit-tasks-%i \ - --memory=24g --pids-limit=16384 --shm-size=1024m ${TMPVOL:-} \ - --volume=\${TEST_CACHE}/images:/cache/images:rw \ - --volume=\${TEST_SECRETS}/tasks:/run/secrets/tasks:ro \ - 
--volume=\${TEST_SECRETS}/webhook:/run/secrets/webhook:ro \ - --volume=${IMAGE_STORES}:/config/image-stores:ro \ - --volume=/etc/npmrc:/etc/npmrc:ro \ + --device=/dev/kvm \ + --volume=${CACHE}/images:/cache/images:rw \ + --volume=${SECRETS}/tasks:/run/secrets/tasks:ro \ + --volume=${SECRETS}/webhook:/run/secrets/webhook:ro \ --volume=/etc/job-runner.toml:/config/job-runner.toml:ro \ --volume=%t/podman/podman.sock:/podman.sock:rw \ --env=JOB_RUNNER_CONFIG=/config/job-runner.toml \ --env=COCKPIT_GITHUB_TOKEN_FILE=/run/secrets/webhook/.config--github-token \ --env=COCKPIT_S3_KEY_DIR=/run/secrets/tasks/s3-keys \ - --env=COCKPIT_IMAGE_STORES_FILE=/config/image-stores \ - --env=TEST_JOBS=\${TEST_JOBS} \ - --env=TEST_NOTIFICATION_MX=\${TEST_NOTIFICATION_MX} \ - --env=TEST_NOTIFICATION_TO=\${TEST_NOTIFICATION_TO} \ + --env=GIT_COMMITTER_NAME=Cockpituous \ + --env=GIT_COMMITTER_EMAIL=cockpituous@cockpit-project.org \ + --env=GIT_AUTHOR_NAME=Cockpituous \ + --env=GIT_AUTHOR_EMAIL=cockpituous@cockpit-project.org \ + --env=TEST_NOTIFICATION_MX=${TEST_NOTIFICATION_MX} \ + --env=TEST_NOTIFICATION_TO=${TEST_NOTIFICATION_TO} \ quay.io/cockpit/tasks cockpit-tasks --verbose ExecStop=/usr/bin/podman rm -f cockpit-tasks-%i -ExecStop=/usr/bin/podman network rm cockpit-tasks-%i [Install] WantedBy=multi-user.target diff --git a/tasks/run-local.sh b/tasks/run-local.sh index 5a9b7841..7b3c7bb2 100755 --- a/tasks/run-local.sh +++ b/tasks/run-local.sh @@ -117,15 +117,13 @@ EOF create_job_runner_config() { # we never want to push to real GitHub branches in this test - run_args="'--security-opt=label=disable', '--volume=$MYDIR/mock-git-push:/usr/local/bin/git:ro'" - if [ "$1" = "mock" ]; then forge_opts="api-url = '$GHAPI_URL_POD'" # needs to run in pod network so that it can access GITHUB_API_POD - run_args="${run_args}, '--pod=cockpituous'" - run_args="${run_args}, '--env=GITHUB_API=$GHAPI_URL_POD', '--env=COCKPIT_IMAGE_UPLOAD_STORE=$S3_URL_POD/images/'" + run_args="'--pod=cockpituous', 
'--env=GITHUB_API=$GHAPI_URL_POD'" elif [ "$1" = "real" ]; then forge_opts="" + run_args="" else echo "ERROR: unknown job-runner config $1" >&2 exit 1 @@ -146,7 +144,14 @@ key = [{file="/run/secrets/tasks/s3-keys/localhost.localdomain"}] [container] command = ['podman-remote', '--url=unix:///podman.sock'] -run-args = [$run_args] +run-args = [ + '--security-opt=label=disable', + '--volume=$MYDIR/mock-git-push:/usr/local/bin/git:ro', + '--env=COCKPIT_IMAGE_UPLOAD_STORE=$S3_URL_POD/images/', + '--env=GIT_AUTHOR_*', + '--env=GIT_COMMITTER_*', + $run_args +] [container.secrets] # these are *host* paths, this is podman-remote @@ -249,6 +254,11 @@ EOF --env=S3_LOGS_URL=$S3_URL_POD/logs/ \ --env=COCKPIT_S3_KEY_DIR=/run/secrets/tasks/s3-keys \ --env=COCKPIT_IMAGE_UPLOAD_STORE=$S3_URL_POD/images/ \ + --env=COCKPIT_IMAGES_DATA_DIR=/cache/images \ + --env=GIT_COMMITTER_NAME=Cockpituous \ + --env=GIT_COMMITTER_EMAIL=cockpituous@cockpit-project.org \ + --env=GIT_AUTHOR_NAME=Cockpituous \ + --env=GIT_AUTHOR_EMAIL=cockpituous@cockpit-project.org \ --env=SKIP_STATIC_CHECK=1 \ quay.io/cockpit/tasks:${TASKS_TAG:-latest} bash