Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CERTTF-388] [CERTTF-443] Implement masking on the agent #404

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
40 changes: 38 additions & 2 deletions agent/testflinger_agent/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
import time

from testflinger_agent.errors import TFServerError
from .runner import CommandRunner, RunnerEvents
from testflinger_agent.masking import Masker
from .runner import CommandRunner, MaskingCommandRunner, RunnerEvents
from .handlers import LiveOutputHandler, LogUpdateHandler
from .stop_condition_checkers import (
JobCancelledChecker,
Expand All @@ -43,6 +44,41 @@ def __init__(self, job_data, client):
self.job_id = job_data.get("job_id")
self.phase = "unknown"

def get_runner(self, rundir):
hash_length = 6
# retrieve sensitive patterns from the agent configuration
sensitive_patterns = self.client.config.get("sensitive_patterns", [])
# retrieve secrets from the job
secrets_dict = self.job_data.get("secrets")
Comment on lines +49 to +52
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the logic is self-explanatory, no need for comments here

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I must say that in this particular case the comments were deliberate: I mean to emphasize how the sources of these patterns (sensitive vs. secrets) are different, i.e. that the former is static and comes from the agent configuration whereas the latter is dynamic and differs for every job. I believe one can easily miss that by skimming over the code.


if secrets_dict:
# inject secrets into the runner environment
environment = {
**self.client.config,
**secrets_dict,
}
# add secrets to the sensitive variables
sensitive_patterns.extend(secrets_dict.values())
# secrets and (possibly) sensitive information
Comment on lines +55 to +62
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here

masker = Masker(
patterns=sensitive_patterns, hash_length=hash_length
)
return MaskingCommandRunner(
cwd=rundir, env=environment, masker=masker
)

if sensitive_patterns:
# sensitive patterns but no sensitive variables and no secrets
masker = Masker(
patterns=sensitive_patterns, hash_length=hash_length
)
return MaskingCommandRunner(
cwd=rundir, env=self.client.config, masker=masker
)

# no secrets, no sensitive information: use regular runner
return CommandRunner(cwd=rundir, env=self.client.config)

def run_test_phase(self, phase, rundir):
"""Run the specified test phase in rundir

Expand Down Expand Up @@ -82,7 +118,7 @@ def run_test_phase(self, phase, rundir):
serial_log = os.path.join(rundir, phase + "-serial.log")

logger.info("Running %s_command: %s", phase, cmd)
runner = CommandRunner(cwd=rundir, env=self.client.config)
runner = self.get_runner(rundir)
output_log_handler = LogUpdateHandler(output_log)
live_output_handler = LiveOutputHandler(self.client, self.job_id)
runner.register_output_handler(output_log_handler)
Expand Down
50 changes: 50 additions & 0 deletions agent/testflinger_agent/masking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""
Text masking for sensitive information.

This module provides functionality to mask sensitive information in text.
Each matched pattern is replaced with a deterministic hash, ensuring that
the same sensitive information is always replaced by the same mask.
"""

import hashlib
import re
from typing import List, Optional


class Masker:
"""
A class for masking sensitive information in text.

Example:
>>> masker = Masker(['\\d{3}-\\d{2}-\\d{4}'], hash_length=6)
>>> masker.apply("SSN: 123-45-6789")
'SSN: **01a546**'
"""

def __init__(self, patterns: List[str], hash_length: Optional[int] = None):
# join all patterns into a single combined pattern and compile it
combined = self.combine(pattern for pattern in patterns if pattern)
if not combined:
raise ValueError("Empty combined pattern")
self.pattern = re.compile(combined)
# the length of the hash to be used for masking
# (a value of `None` results in the full hash length)
self.hash_length = hash_length

@staticmethod
def combine(patterns: List[str]) -> str:
"""Return a single disjunctive regular expression from `patterns`"""
return "|".join(f"({pattern})" for pattern in patterns)

@staticmethod
def hash(text: str) -> str:
"""Return a hash for `text`"""
return hashlib.sha256(text.encode()).hexdigest()

def mask(self, match: re.Match) -> str:
"""Return a string with a mask applied on a pattern `match`"""
return f"**{self.hash(match.group())[:self.hash_length]}**"

def apply(self, text: str) -> str:
"""Return a string with all pattern matches in `text` masked"""
return self.pattern.sub(self.mask, text)
15 changes: 15 additions & 0 deletions agent/testflinger_agent/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
from typing import Callable, List, Optional, Tuple

from testflinger_common.enums import TestEvent
from testflinger_agent.masking import Masker


logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -157,6 +159,19 @@ def run(self, cmd: str) -> Tuple[int, Optional[TestEvent], str]:
return self.process.returncode, stop_event, stop_reason


class MaskingCommandRunner(CommandRunner):
"""A CommandRunner that masks sensitive information in the output"""

def __init__(self, *args, masker: Masker, **kwargs):
super().__init__(*args, **kwargs)
self.masker = masker

def post_output(self, data: str):
# mask sensitive information before posting output data
data = self.masker.apply(data)
super().post_output(data)


def get_stop_reason(returncode: int, stop_reason: str) -> str:
"""
Try to give some reason for the job stopping based on what we know.
Expand Down
1 change: 1 addition & 0 deletions agent/testflinger_agent/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
voluptuous.Optional("output_timeout"): int,
voluptuous.Optional("advertised_queues"): dict,
voluptuous.Optional("advertised_images"): dict,
voluptuous.Optional("sensitive_patterns"): list,
}


Expand Down
109 changes: 108 additions & 1 deletion agent/testflinger_agent/tests/test_job.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import json
import os
import pytest
import re
import shutil
import tempfile

Expand All @@ -15,7 +17,7 @@
GlobalTimeoutChecker,
OutputTimeoutChecker,
)
from testflinger_common.enums import TestEvent
from testflinger_common.enums import TestEvent, TestPhase


class TestJob:
Expand Down Expand Up @@ -173,6 +175,111 @@ def test_run_test_phase_with_run_exception(
assert exit_event == "setup_fail"
assert exit_reason == "failed"

def run_testcmds(self, job_data, client, tmp_path) -> str:
# create the outcome file manually
with open(tmp_path / "testflinger-outcome.json", "w") as outcome_file:
outcome_file.write("{}")
# create the agent configuration file manually
with open(tmp_path / "default.yaml", "w") as config_file:
config_file.write("agent_name: agent-007")
# write the job data to `testflinger.json` manually:
# (so that the device connector can pick it up)
with open(tmp_path / "testflinger.json", "w") as job_file:
json.dump(job_data, job_file)

# running the device connector will result in running the `test_cmds`
self.config["test_command"] = (
"testflinger-device-connector fake_connector runtest "
f"--config {tmp_path}/default.yaml "
f"{tmp_path}/testflinger.json"
)

# run the test phase of the job
job = _TestflingerJob(job_data, client)
exit_code, _, _ = job.run_test_phase(TestPhase.TEST, tmp_path)
assert exit_code == 0

# capture the output
with open(tmp_path / "test.log") as log:
return log.read()

def test_run_test_phase_sensitive(self, client, tmp_path, requests_mock):
requests_mock.post(rmock.ANY, status_code=200)

# sensitive patterns are added to the agent configuration file:
# this will result in the use of a masking command runner
self.config["sensitive_patterns"] = ["[Cc][a-z]*n"]

# create the job data
job_data = {
"test_data": {"test_cmds": "echo -n Message: Captain Major"}
}

log_data = self.run_testcmds(job_data, client, tmp_path)
print(log_data)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are you supposed to print log data to stdout, not stderr?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a test, so neither stdout nor stderr are printed when the test is successful. They are however both captured and printed (separately) when the test fails, so this print is meant as a diagnostic, i.e. you only see it when the test fails and might help you understand what went wrong. I'd happily change this to print to stderr if you feel it makes any difference.


# the output should match this pattern, i.e. the sensitive pattern
# after "Message: " should be masked
output_pattern = r"^Message: \*\*([0-9a-f]{6})\*\* Major$"
match = re.search(output_pattern, log_data, re.MULTILINE)
assert match is not None

def test_run_test_phase_secret(self, client, tmp_path, requests_mock):
requests_mock.post(rmock.ANY, status_code=200)

# create the job data
job_data = {
"secrets": {"SECRET": "Major"},
"test_data": {
"test_cmds": "echo -n Message: Captain Major $SECRET"
},
}

log_data = self.run_testcmds(job_data, client, tmp_path)
print(log_data)

# the output should match this pattern, i.e. the secrets
# after "Message: " should be masked
output_pattern = (
r"^Message: Captain \*\*([0-9a-f]{6})\*\* \*\*([0-9a-f]{6})\*\*$"
)
match = re.search(output_pattern, log_data, re.MULTILINE)
assert match is not None
first, second = match.groups()
assert first == second

def test_run_test_phase_sensitive_and_secret(
self, client, tmp_path, requests_mock
):
requests_mock.post(rmock.ANY, status_code=200)

# sensitive patterns are added to the agent configuration file:
self.config["sensitive_patterns"] = ["[Cc][a-z]*n"]

# create the job data
job_data = {
"secrets": {"SECRET": "Major"},
"test_data": {
"test_cmds": "echo -n Message: Captain Major $SECRET"
},
}

log_data = self.run_testcmds(job_data, client, tmp_path)
print(log_data)

# the output should match this pattern, i.e. the strings
# after "Message: " should be masked
output_pattern = (
r"^Message: "
r"\*\*([0-9a-f]{6})\*\* "
r"\*\*([0-9a-f]{6})\*\* "
r"\*\*([0-9a-f]{6})\*\*$"
)
match = re.search(output_pattern, log_data, re.MULTILINE)
assert match is not None
first, second, third = match.groups()
assert first != second and second == third

def test_set_truncate(self, client):
"""Test the _set_truncate method of TestflingerJob"""
job = _TestflingerJob({}, client)
Expand Down
72 changes: 72 additions & 0 deletions agent/testflinger_agent/tests/test_masker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
from testflinger_agent.masking import Masker


def test_hash_consistency():
text = "123 testflinger-456 no-secret"
# hash same text twice and compare hashes
hash1 = Masker.hash(text)
hash2 = Masker.hash(text)
assert hash1 == hash2
# expected hash length is the full SHA-256 hash, i.e. 64 characters
assert len(hash1) == 64


def test_no_matches():
# masker for numerical values
masker = Masker([r"\d+"])
# input text has no numerical values
text = "testflinger"
# there should be no masking
masked = masker.apply(text)
print(masked)
assert masked == text


def test_no_hash_length():
# masker for numerical values, no hash length
masker = Masker([r"\d+"])
# input text is numerical
text = "123456"
masked = masker.apply(text)
print(masked)
# expected hash length is the full SHA-256 hash, i.e. 64 characters
assert len(masked) == 64 + 4


def test_mask_format():
hash_length = 12
text = "123456"
masker = Masker([r"\d+"], hash_length=hash_length)
masked = masker.apply(text)
print(masked)
assert masked.startswith("**")
assert masked.endswith("**")
assert len(masked) == hash_length + 4


def test_apply_with_single_pattern():
hash_length = 12
text = "SSN: 123-45-6789"
masker = Masker([r"\b\d{3}-\d{2}-\d{4}\b"], hash_length=hash_length)
masked = masker.apply(text)
print(masked)
assert "123-45-6789" not in masked
assert masked.startswith("SSN: **")
assert masked.endswith("**")


def test_apply_with_multiple_matches():
hash_length = 12
text = "Contact: john@example.com, SSN: 123-45-6789"
masker = Masker(
[
r"\b\d{3}-\d{2}-\d{4}\b",
r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b",
],
hash_length=hash_length,
)
masked = masker.apply(text)
print(masked)
assert "john@example.com" not in masked
assert "123-45-6789" not in masked
assert masked.count("**") == 4
Loading