Merge pull request #338 from alexmwu/logging-fix

Fix logging blocking issue
google · Aug 29, 2023 · d86a047 · d86a047
2 parents 884b941 + 2531da7
commit d86a047
Show file tree

Hide file tree

Showing 37 changed files with 821 additions and 123 deletions.
diff --git a/cloudbuild.yaml b/cloudbuild.yaml
@@ -108,7 +108,7 @@ steps:
     cd launcher/image/test
     echo "running launch policy tests on ${OUTPUT_IMAGE_PREFIX}-hardened-${OUTPUT_IMAGE_SUFFIX}"
     gcloud builds submit --config=test_launchpolicy_cloudbuild.yaml --region us-west1 \
-      --substitutions _IMAGE_NAME=${OUTPUT_IMAGE_PREFIX}-hardened-${OUTPUT_IMAGE_SUFFIX},_IMAGE_PROJECT=${PROJECT_ID}
+      --substitutions _HARDENED_IMAGE_NAME=${OUTPUT_IMAGE_PREFIX}-hardened-${OUTPUT_IMAGE_SUFFIX},_IMAGE_PROJECT=${PROJECT_ID}
     exit
 
 - name: 'gcr.io/cloud-builders/gcloud'
@@ -139,6 +139,22 @@ steps:
     gcloud builds submit --config=test_ingress_network.yaml --region us-west1 \
       --substitutions _IMAGE_NAME=${OUTPUT_IMAGE_PREFIX}-debug-${OUTPUT_IMAGE_SUFFIX},_IMAGE_PROJECT=${PROJECT_ID}
     exit
+- name: 'gcr.io/cloud-builders/gcloud'
+  id: LogRedirectionTests
+  waitFor: ['HardenedImageBuild']
+  env:
+  - 'OUTPUT_IMAGE_PREFIX=$_OUTPUT_IMAGE_PREFIX'
+  - 'OUTPUT_IMAGE_SUFFIX=$_OUTPUT_IMAGE_SUFFIX'
+  - 'PROJECT_ID=$PROJECT_ID'
+  script: |
+    #!/usr/bin/env bash
+
+    cd launcher/image/test
+    echo "running log redirection tests on ${OUTPUT_IMAGE_PREFIX}-hardened-${OUTPUT_IMAGE_SUFFIX}"
+    gcloud builds submit --config=test_log_redirection.yaml --region us-west1 \
+      --substitutions _HARDENED_IMAGE_NAME=${OUTPUT_IMAGE_PREFIX}-hardened-${OUTPUT_IMAGE_SUFFIX},_IMAGE_PROJECT=${PROJECT_ID}
+    exit
+
 
 options:
   pool:

diff --git a/launcher/container_runner.go b/launcher/container_runner.go
@@ -42,10 +42,11 @@ import (
 
 // ContainerRunner contains information about the container settings
 type ContainerRunner struct {
-	container   containerd.Container
-	launchSpec  spec.LaunchSpec
-	attestAgent agent.AttestationAgent
-	logger      *log.Logger
+	container     containerd.Container
+	launchSpec    spec.LaunchSpec
+	attestAgent   agent.AttestationAgent
+	logger        *log.Logger
+	serialConsole *os.File
 }
 
 const (
@@ -100,7 +101,7 @@ func fetchImpersonatedToken(ctx context.Context, serviceAccount string, audience
 }
 
 // NewRunner returns a runner.
-func NewRunner(ctx context.Context, cdClient *containerd.Client, token oauth2.Token, launchSpec spec.LaunchSpec, mdsClient *metadata.Client, tpm io.ReadWriteCloser, logger *log.Logger) (*ContainerRunner, error) {
+func NewRunner(ctx context.Context, cdClient *containerd.Client, token oauth2.Token, launchSpec spec.LaunchSpec, mdsClient *metadata.Client, tpm io.ReadWriteCloser, logger *log.Logger, serialConsole *os.File) (*ContainerRunner, error) {
 	image, err := initImage(ctx, cdClient, launchSpec, token)
 	if err != nil {
 		return nil, err
@@ -241,6 +242,7 @@ func NewRunner(ctx context.Context, cdClient *containerd.Client, token oauth2.To
 		launchSpec,
 		agent.CreateAttestationAgent(tpm, client.GceAttestationKeyECC, verifierClient, principalFetcher),
 		logger,
+		serialConsole,
 	}, nil
 }
 
@@ -506,12 +508,24 @@ func (r *ContainerRunner) Run(ctx context.Context) error {
 	}
 
 	var streamOpt cio.Opt
-	if r.launchSpec.LogRedirect {
-		streamOpt = cio.WithStreams(nil, r.logger.Writer(), r.logger.Writer())
-		r.logger.Println("container stdout/stderr will be redirected")
-	} else {
+	switch r.launchSpec.LogRedirect {
+	case spec.Nowhere:
 		streamOpt = cio.WithStreams(nil, nil, nil)
-		r.logger.Println("container stdout/stderr will not be redirected")
+		r.logger.Println("Container stdout/stderr will not be redirected.")
+	case spec.Everywhere:
+		w := io.MultiWriter(os.Stdout, r.serialConsole)
+		streamOpt = cio.WithStreams(nil, w, w)
+		r.logger.Println("Container stdout/stderr will be redirected to serial and Cloud Logging. " +
+			"This may result in performance issues due to slow serial console writes.")
+	case spec.CloudLogging:
+		streamOpt = cio.WithStreams(nil, os.Stdout, os.Stdout)
+		r.logger.Println("Container stdout/stderr will be redirected to Cloud Logging.")
+	case spec.Serial:
+		streamOpt = cio.WithStreams(nil, r.serialConsole, r.serialConsole)
+		r.logger.Println("Container stdout/stderr will be redirected to serial logging. " +
+			"This may result in performance issues due to slow serial console writes.")
+	default:
+		return fmt.Errorf("unknown logging redirect location: %v", r.launchSpec.LogRedirect)
 	}
 
 	task, err := r.container.NewTask(ctx, cio.NewCreator(streamOpt))

diff --git a/launcher/go.mod b/launcher/go.mod
@@ -4,7 +4,6 @@ go 1.20
 
 require (
 	cloud.google.com/go/compute v1.7.0
-	cloud.google.com/go/logging v1.4.2
 	github.com/cenkalti/backoff/v4 v4.1.3
 	github.com/containerd/containerd v1.6.18
 	github.com/golang-jwt/jwt/v4 v4.4.1

diff --git a/launcher/image/container-runner.service b/launcher/image/container-runner.service
@@ -7,8 +7,8 @@ After=network-online.target gcr-online.target containerd.service
 ExecStart=/usr/share/oem/confidential_space/cs_container_launcher
 ExecStopPost=/usr/share/oem/confidential_space/exit_script.sh
 Restart=no
-StandardOutput=journal+console
-StandardError=journal+console
+StandardOutput=journal
+StandardError=journal
 
 [Install]
 WantedBy=multi-user.target
diff --git a/launcher/image/entrypoint.sh b/launcher/image/entrypoint.sh
@@ -1,12 +1,16 @@
 #!/bin/bash
 
 main() {
-  # copy systemd files
+  # Copy service files.
   cp /usr/share/oem/confidential_space/container-runner.service /etc/systemd/system/container-runner.service
+  # Override default fluent-bit config.
+  cp /usr/share/oem/confidential_space/fluent-bit-cs.conf /etc/fluent-bit/fluent-bit.conf
 
   systemctl daemon-reload
   systemctl enable container-runner.service
   systemctl start container-runner.service
+  systemctl start fluent-bit.service
+
 }
 
 main
diff --git a/launcher/image/fluent-bit-cs.conf b/launcher/image/fluent-bit-cs.conf
@@ -0,0 +1,65 @@
+#
+# Copyright 2022 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Forked from https://cos.googlesource.com/cos/overlays/board-overlays/+/refs/heads/master/project-lakitu/app-admin/fluent-bit/files/fluent-bit.conf
+
+[SERVICE]
+    # Flush
+    # =====
+    # Interval, in seconds, between flushes of records to a destination.
+    flush        1
+    # Daemon
+    # ======
+    # instruct Fluent Bit to run in foreground or background mode.
+    daemon       Off
+    # Log_Level
+    # =========
+    # Set the verbosity level of the service, values can be:
+    #
+    # - error
+    # - warning
+    # - info
+    # - debug
+    # - trace
+    #
+    # by default 'info' is set, that means it includes 'error' and 'warning'.
+    log_level    info
+    # Storage
+    # =======
+    # Fluent Bit can use memory and filesystem buffering based mechanisms
+    #
+    # - https://docs.fluentbit.io/manual/administration/buffering-and-storage
+    #
+    # storage metrics
+    # ---------------
+    # publish storage pipeline metrics in '/api/v1/storage'. The metrics are
+    # exported only if the 'http_server' option is enabled.
+    #
+    storage.metrics on
+
+# Collects CS launcher and workload logs.
+[INPUT]
+    Name systemd
+    Tag  confidential-space-launcher
+    Systemd_Filter _SYSTEMD_UNIT=container-runner.service
+    DB /var/log/google-fluentbit/container-runner.log.db
+    Read_From_Tail False
+
+[OUTPUT]
+    Name        stackdriver
+    Match       *
+    Resource    gce_instance
+    severity_key severity
diff --git a/launcher/image/preload.sh b/launcher/image/preload.sh
@@ -51,14 +51,20 @@ configure_necessary_systemd_units() {
   # Dependencies of container-runner.service.
   enable_unit "network-online.target"
   enable_unit "gcr-online.target"
+
+}
+
+configure_cloud_logging() {
+  # Copy CS-specific fluent-bit config to OEM partition.
+  cp fluent-bit-cs.conf "${CS_PATH}"
 }
 
 configure_systemd_units_for_debug() {
-  # No-op for now, as debug will default to using multi-user.target.
-  :
+  configure_cloud_logging
 }
 configure_systemd_units_for_hardened() {
   configure_necessary_systemd_units
+  configure_cloud_logging
   # Make entrypoint (via cloud-init) the default unit.
   set_default_boot_target "cloud-final.service"
 
@@ -70,6 +76,10 @@ configure_systemd_units_for_hardened() {
   disable_unit "google-startup-scripts.service"
   disable_unit "google-shutdown-scripts.service"
   disable_unit "konlet-startup.service"
+  disable_unit "crash-reporter.service"
+  disable_unit "device_policy_manager.service"
+  disable_unit "node-problem-detector.service"
+  disable_unit "docker-events-collector-fluent-bit.service"
   disable_unit "sshd.service"
   disable_unit "var-lib-toolbox.mount"
 }

diff --git a/launcher/image/test/README.md b/launcher/image/test/README.md
@@ -44,6 +44,7 @@ Scripts in `util/` contain functions that can be sourced from other test scripts
 * `/workspace/status.txt` contains the success/failure message from test steps.
 `check_failure.sh` looks for a failed message in the step to determine whether
 the cloud build is successful.
+* `/workspace/next_start.txt` is used when reading the serial logs.
 
 ## Test Failures
 Due to the sequential/only-proceed-with-success nature of Cloud Build, tests

diff --git a/...test/test_cloud_init_userdata_disabled.sh → ...ipts/test_cloud_init_userdata_disabled.sh b/...test/test_cloud_init_userdata_disabled.sh → ...ipts/test_cloud_init_userdata_disabled.sh
diff --git a/...cher/image/test/test_launcher_workload.sh → ...ge/test/scripts/test_launcher_workload.sh b/...cher/image/test/test_launcher_workload.sh → ...ge/test/scripts/test_launcher_workload.sh
diff --git a/launcher/image/test/scripts/test_launcher_workload_cloudlogging.sh b/launcher/image/test/scripts/test_launcher_workload_cloudlogging.sh
@@ -0,0 +1,111 @@
+ #!/bin/bash
+set -euo pipefail
+source util/read_cloud_logging.sh
+
+# This test requires the workload to run and print
+# corresponding messages to cloud logging.
+CLOUD_LOGGING_OUTPUT=$(read_cloud_logging $1) 
+print_logs=false
+
+if echo $CLOUD_LOGGING_OUTPUT | grep -q 'Workload running'
+then
+    echo "- workload running verified"
+else
+    echo "FAILED: workload not running"
+    echo 'TEST FAILED.' > /workspace/status.txt
+    print_logs=true
+fi
+
+if echo $CLOUD_LOGGING_OUTPUT | grep -q 'Workload args: \[/main newCmd\]'
+then
+    echo "- arguments verified"
+else
+    echo "FAILED: arguments not verified"
+    echo 'TEST FAILED.' > /workspace/status.txt
+    print_logs=true
+fi
+
+if echo $CLOUD_LOGGING_OUTPUT | grep -q 'env_bar=val_bar'
+then
+    echo "- env_bar env var verified"
+else
+    echo "FAILED: env_bar env not verified"
+    echo 'TEST FAILED.' > /workspace/status.txt
+    print_logs=true
+fi
+
+if echo $CLOUD_LOGGING_OUTPUT | grep -q 'ALLOWED_OVERRIDE=overridden'
+then
+    echo "- ALLOWED_OVERRIDE env var verified"
+else
+    echo "FAILED: ALLOWED_OVERRIDE env not verified"
+    echo 'TEST FAILED.' > /workspace/status.txt
+    print_logs=true
+fi
+
+if echo $CLOUD_LOGGING_OUTPUT | grep -q 'aud: https://sts.googleapis.com'
+then
+    echo "- token aud verified"
+else
+    echo "FAILED: token aud not verified"
+    echo 'TEST FAILED.' > /workspace/status.txt
+    print_logs=true
+fi
+
+if echo $CLOUD_LOGGING_OUTPUT | grep -q 'iss: https://confidentialcomputing.googleapis.com'
+then
+    echo "- token iss verified"
+else
+    echo "FAILED: token iss not verified"
+    echo 'TEST FAILED.' > /workspace/status.txt
+    print_logs=true
+fi
+
+if echo $CLOUD_LOGGING_OUTPUT | grep -q 'secboot: true'
+then
+    echo "- token secboot verified"
+else
+    echo "FAILED: token secboot not verified"
+    echo 'TEST FAILED.' > /workspace/status.txt
+    print_logs=true
+fi
+
+if echo $CLOUD_LOGGING_OUTPUT | grep -q 'oemid: 11129'
+then
+    echo "- token oemid verified"
+else
+    echo "FAILED: token oemid not verified"
+    echo 'TEST FAILED.' > /workspace/status.txt
+    print_logs=true
+fi
+
+if echo $CLOUD_LOGGING_OUTPUT | grep -q 'hwmodel: GCP_AMD_SEV'
+then
+    echo "- token hwmodel verified"
+else
+    echo "FAILED: token hwmodel not verified"
+    echo 'TEST FAILED.' > /workspace/status.txt
+    print_logs=true
+fi
+
+if echo $CLOUD_LOGGING_OUTPUT | grep -q 'swname: GCE'
+then
+    echo "- token swname verified"
+else
+    echo "FAILED: token swname not verified"
+    echo 'TEST FAILED.' > /workspace/status.txt
+    print_logs=true
+fi
+
+if echo $CLOUD_LOGGING_OUTPUT | grep -q 'Token looks okay'
+then
+    echo "- OIDC token accessible"
+else
+    echo "FAILED: OIDC token not accessible"
+    echo 'TEST FAILED.' > /workspace/status.txt
+    print_logs=true
+fi
+
+if $print_logs; then
+    echo $CLOUD_LOGGING_OUTPUT
+fi
diff --git a/launcher/image/test/scripts/test_launchpolicy_cmd_cloudlogging.sh b/launcher/image/test/scripts/test_launchpolicy_cmd_cloudlogging.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+set -euo pipefail
+source util/read_cloud_logging.sh
+
+# Allow VM some time to boot and write to serial console.
+sleep 120
+
+CLOUD_LOGGING_OUTPUT=$(read_cloud_logging $1)
+if echo $CLOUD_LOGGING_OUTPUT | grep -q 'CMD is not allowed to be overridden on this image'
+then
+    echo "- CMD launch policy verified"
+else
+    echo "FAILED: CMD launch policy verification"
+    echo 'TEST FAILED' > /workspace/status.txt
+    echo $CLOUD_LOGGING_OUTPUT
+fi
diff --git a/launcher/image/test/scripts/test_launchpolicy_env_cloudlogging.sh b/launcher/image/test/scripts/test_launchpolicy_env_cloudlogging.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+set -euo pipefail
+source util/read_cloud_logging.sh
+
+# Allow VM some time to boot and write to cloud logging.
+sleep 120
+
+CLOUD_LOGGING_OUTPUT=$(read_cloud_logging $1)
+if echo $CLOUD_LOGGING_OUTPUT | grep -q --fixed-strings 'env var {OUT a} is not allowed to be overridden on this image; allowed envs to be overridden: [ALLOWED_OVERRIDE]'
+then
+    echo "- Env launch policy verified"
+else
+    echo "FAILED: Env launch policy verification"
+    echo 'TEST FAILED' > /workspace/status.txt
+    echo $CLOUD_LOGGING_OUTPUT
+fi
diff --git a/...age/test/scripts/test_launchpolicy_log.sh → ...st/scripts/test_launchpolicy_log_debug.sh b/...age/test/scripts/test_launchpolicy_log.sh → ...st/scripts/test_launchpolicy_log_debug.sh