canonical · boukeas · Nov 11, 2024 · Nov 12, 2024 · Nov 13, 2024 · Nov 13, 2024
diff --git a/agent/testflinger_agent/job.py b/agent/testflinger_agent/job.py
@@ -19,7 +19,8 @@
 import time
 
 from testflinger_agent.errors import TFServerError
-from .runner import CommandRunner, RunnerEvents
+from testflinger_agent.masking import Masker
+from .runner import CommandRunner, MaskingCommandRunner, RunnerEvents
 from .handlers import LiveOutputHandler, LogUpdateHandler
 from .stop_condition_checkers import (
     JobCancelledChecker,
@@ -43,6 +44,41 @@ def __init__(self, job_data, client):
         self.job_id = job_data.get("job_id")
         self.phase = "unknown"
 
+    def get_runner(self, rundir):
+        hash_length = 6
+        # retrieve sensitive patterns from the agent configuration
+        sensitive_patterns = self.client.config.get("sensitive_patterns", [])
+        # retrieve secrets from the job
+        secrets_dict = self.job_data.get("secrets")
+
+        if secrets_dict:
+            # inject secrets into the runner environment
+            environment = {
+                **self.client.config,
+                **secrets_dict,
+            }
+            # add secrets to the sensitive variables
+            sensitive_patterns.extend(secrets_dict.values())
+            # secrets and (possibly) sensitive information
+            masker = Masker(
+                patterns=sensitive_patterns, hash_length=hash_length
+            )
+            return MaskingCommandRunner(
+                cwd=rundir, env=environment, masker=masker
+            )
+
+        if sensitive_patterns:
+            # sensitive patterns but no sensitive variables and no secrets
+            masker = Masker(
+                patterns=sensitive_patterns, hash_length=hash_length
+            )
+            return MaskingCommandRunner(
+                cwd=rundir, env=self.client.config, masker=masker
+            )
+
+        # no secrets, no sensitive information: use regular runner
+        return CommandRunner(cwd=rundir, env=self.client.config)
+
     def run_test_phase(self, phase, rundir):
         """Run the specified test phase in rundir
 
@@ -82,7 +118,7 @@ def run_test_phase(self, phase, rundir):
         serial_log = os.path.join(rundir, phase + "-serial.log")
 
         logger.info("Running %s_command: %s", phase, cmd)
-        runner = CommandRunner(cwd=rundir, env=self.client.config)
+        runner = self.get_runner(rundir)
         output_log_handler = LogUpdateHandler(output_log)
         live_output_handler = LiveOutputHandler(self.client, self.job_id)
         runner.register_output_handler(output_log_handler)

diff --git a/agent/testflinger_agent/masking.py b/agent/testflinger_agent/masking.py
@@ -0,0 +1,50 @@
+"""
+Text masking for sensitive information.
+
+This module provides functionality to mask sensitive information in text.
+Each matched pattern is replaced with a deterministic hash, ensuring that
+the same sensitive information is always replaced by the same mask.
+"""
+
+import hashlib
+import re
+from typing import List, Optional
+
+
+class Masker:
+    """
+    A class for masking sensitive information in text.
+
+    Example:
+        >>> masker = Masker(['\\d{3}-\\d{2}-\\d{4}'], hash_length=6)
+        >>> masker.apply("SSN: 123-45-6789")
+        'SSN: **01a546**'
+    """
+
+    def __init__(self, patterns: List[str], hash_length: Optional[int] = None):
+        # join all patterns into a single combined pattern and compile it
+        combined = self.combine(pattern for pattern in patterns if pattern)
+        if not combined:
+            raise ValueError("Empty combined pattern")
+        self.pattern = re.compile(combined)
+        # the length of the hash to be used for masking
+        # (a value of `None` results in the full hash length)
+        self.hash_length = hash_length
+
+    @staticmethod
+    def combine(patterns: List[str]) -> str:
+        """Return a single disjunctive regular expression from `patterns`"""
+        return "|".join(f"({pattern})" for pattern in patterns)
+
+    @staticmethod
+    def hash(text: str) -> str:
+        """Return a hash for `text`"""
+        return hashlib.sha256(text.encode()).hexdigest()
+
+    def mask(self, match: re.Match) -> str:
+        """Return a string with a mask applied on a pattern `match`"""
+        return f"**{self.hash(match.group())[:self.hash_length]}**"
+
+    def apply(self, text: str) -> str:
+        """Return a string with all pattern matches in `text` masked"""
+        return self.pattern.sub(self.mask, text)
diff --git a/agent/testflinger_agent/runner.py b/agent/testflinger_agent/runner.py
@@ -26,6 +26,8 @@
 from typing import Callable, List, Optional, Tuple
 
 from testflinger_common.enums import TestEvent
+from testflinger_agent.masking import Masker
+
 
 logger = logging.getLogger(__name__)
 
@@ -157,6 +159,19 @@ def run(self, cmd: str) -> Tuple[int, Optional[TestEvent], str]:
         return self.process.returncode, stop_event, stop_reason
 
 
+class MaskingCommandRunner(CommandRunner):
+    """A CommandRunner that masks sensitive information in the output"""
+
+    def __init__(self, *args, masker: Masker, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.masker = masker
+
+    def post_output(self, data: str):
+        # mask sensitive information before posting output data
+        data = self.masker.apply(data)
+        super().post_output(data)
+
+
 def get_stop_reason(returncode: int, stop_reason: str) -> str:
     """
     Try to give some reason for the job stopping based on what we know.

diff --git a/agent/testflinger_agent/schema.py b/agent/testflinger_agent/schema.py
@@ -45,6 +45,7 @@
     voluptuous.Optional("output_timeout"): int,
     voluptuous.Optional("advertised_queues"): dict,
     voluptuous.Optional("advertised_images"): dict,
+    voluptuous.Optional("sensitive_patterns"): list,
 }
 
 

diff --git a/agent/testflinger_agent/tests/test_job.py b/agent/testflinger_agent/tests/test_job.py
@@ -1,5 +1,7 @@
+import json
 import os
 import pytest
+import re
 import shutil
 import tempfile
 
@@ -15,7 +17,7 @@
     GlobalTimeoutChecker,
     OutputTimeoutChecker,
 )
-from testflinger_common.enums import TestEvent
+from testflinger_common.enums import TestEvent, TestPhase
 
 
 class TestJob:
@@ -173,6 +175,111 @@ def test_run_test_phase_with_run_exception(
         assert exit_event == "setup_fail"
         assert exit_reason == "failed"
 
+    def run_testcmds(self, job_data, client, tmp_path) -> str:
+        # create the outcome file manually
+        with open(tmp_path / "testflinger-outcome.json", "w") as outcome_file:
+            outcome_file.write("{}")
+        # create the agent configuration file manually
+        with open(tmp_path / "default.yaml", "w") as config_file:
+            config_file.write("agent_name: agent-007")
+        # write the job data to `testflinger.json` manually:
+        # (so that the device connector can pick it up)
+        with open(tmp_path / "testflinger.json", "w") as job_file:
+            json.dump(job_data, job_file)
+
+        # running the device connector will result in running the `test_cmds`
+        self.config["test_command"] = (
+            "testflinger-device-connector fake_connector runtest "
+            f"--config {tmp_path}/default.yaml "
+            f"{tmp_path}/testflinger.json"
+        )
+
+        # run the test phase of the job
+        job = _TestflingerJob(job_data, client)
+        exit_code, _, _ = job.run_test_phase(TestPhase.TEST, tmp_path)
+        assert exit_code == 0
+
+        # capture the output
+        with open(tmp_path / "test.log") as log:
+            return log.read()
+
+    def test_run_test_phase_sensitive(self, client, tmp_path, requests_mock):
+        requests_mock.post(rmock.ANY, status_code=200)
+
+        # sensitive patterns are added to the agent configuration file:
+        # this will result in the use of a masking command runner
+        self.config["sensitive_patterns"] = ["[Cc][a-z]*n"]
+
+        # create the job data
+        job_data = {
+            "test_data": {"test_cmds": "echo -n Message: Captain Major"}
+        }
+
+        log_data = self.run_testcmds(job_data, client, tmp_path)
+        print(log_data)
+
+        # the output should match this pattern, i.e. the sensitive pattern
+        # after "Message: " should be masked
+        output_pattern = r"^Message: \*\*([0-9a-f]{6})\*\* Major$"
+        match = re.search(output_pattern, log_data, re.MULTILINE)
+        assert match is not None
+
+    def test_run_test_phase_secret(self, client, tmp_path, requests_mock):
+        requests_mock.post(rmock.ANY, status_code=200)
+
+        # create the job data
+        job_data = {
+            "secrets": {"SECRET": "Major"},
+            "test_data": {
+                "test_cmds": "echo -n Message: Captain Major $SECRET"
+            },
+        }
+
+        log_data = self.run_testcmds(job_data, client, tmp_path)
+        print(log_data)
+
+        # the output should match this pattern, i.e. the secrets
+        # after "Message: " should be masked
+        output_pattern = (
+            r"^Message: Captain \*\*([0-9a-f]{6})\*\* \*\*([0-9a-f]{6})\*\*$"
+        )
+        match = re.search(output_pattern, log_data, re.MULTILINE)
+        assert match is not None
+        first, second = match.groups()
+        assert first == second
+
+    def test_run_test_phase_sensitive_and_secret(
+        self, client, tmp_path, requests_mock
+    ):
+        requests_mock.post(rmock.ANY, status_code=200)
+
+        # sensitive patterns are added to the agent configuration file:
+        self.config["sensitive_patterns"] = ["[Cc][a-z]*n"]
+
+        # create the job data
+        job_data = {
+            "secrets": {"SECRET": "Major"},
+            "test_data": {
+                "test_cmds": "echo -n Message: Captain Major $SECRET"
+            },
+        }
+
+        log_data = self.run_testcmds(job_data, client, tmp_path)
+        print(log_data)
+
+        # the output should match this pattern, i.e. the strings
+        # after "Message: " should be masked
+        output_pattern = (
+            r"^Message: "
+            r"\*\*([0-9a-f]{6})\*\* "
+            r"\*\*([0-9a-f]{6})\*\* "
+            r"\*\*([0-9a-f]{6})\*\*$"
+        )
+        match = re.search(output_pattern, log_data, re.MULTILINE)
+        assert match is not None
+        first, second, third = match.groups()
+        assert first != second and second == third
+
     def test_set_truncate(self, client):
         """Test the _set_truncate method of TestflingerJob"""
         job = _TestflingerJob({}, client)

diff --git a/agent/testflinger_agent/tests/test_masker.py b/agent/testflinger_agent/tests/test_masker.py
@@ -0,0 +1,72 @@
+from testflinger_agent.masking import Masker
+
+
+def test_hash_consistency():
+    text = "123 testflinger-456 no-secret"
+    # hash same text twice and compare hashes
+    hash1 = Masker.hash(text)
+    hash2 = Masker.hash(text)
+    assert hash1 == hash2
+    # expected hash length is the full SHA-256 hash, i.e. 64 characters
+    assert len(hash1) == 64
+
+
+def test_no_matches():
+    # masker for numerical values
+    masker = Masker([r"\d+"])
+    # input text has no numerical values
+    text = "testflinger"
+    # there should be no masking
+    masked = masker.apply(text)
+    print(masked)
+    assert masked == text
+
+
+def test_no_hash_length():
+    # masker for numerical values, no hash length
+    masker = Masker([r"\d+"])
+    # input text is numerical
+    text = "123456"
+    masked = masker.apply(text)
+    print(masked)
+    # expected hash length is the full SHA-256 hash, i.e. 64 characters
+    assert len(masked) == 64 + 4
+
+
+def test_mask_format():
+    hash_length = 12
+    text = "123456"
+    masker = Masker([r"\d+"], hash_length=hash_length)
+    masked = masker.apply(text)
+    print(masked)
+    assert masked.startswith("**")
+    assert masked.endswith("**")
+    assert len(masked) == hash_length + 4
+
+
+def test_apply_with_single_pattern():
+    hash_length = 12
+    text = "SSN: 123-45-6789"
+    masker = Masker([r"\b\d{3}-\d{2}-\d{4}\b"], hash_length=hash_length)
+    masked = masker.apply(text)
+    print(masked)
+    assert "123-45-6789" not in masked
+    assert masked.startswith("SSN: **")
+    assert masked.endswith("**")
+
+
+def test_apply_with_multiple_matches():
+    hash_length = 12
+    text = "Contact: john@example.com, SSN: 123-45-6789"
+    masker = Masker(
+        [
+            r"\b\d{3}-\d{2}-\d{4}\b",
+            r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b",
+        ],
+        hash_length=hash_length,
+    )
+    masked = masker.apply(text)
+    print(masked)
+    assert "john@example.com" not in masked
+    assert "123-45-6789" not in masked
+    assert masked.count("**") == 4