Skip to content

Commit

Permalink
fix: failure recovery (#41)
Browse files Browse the repository at this point in the history
  • Loading branch information
yanksyoon authored Jan 4, 2024
1 parent db7b417 commit c5ac369
Show file tree
Hide file tree
Showing 12 changed files with 375 additions and 60 deletions.
10 changes: 8 additions & 2 deletions .github/workflows/integration_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@ on:
jobs:
integration-tests:
uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main
with:
pre-run-script: tests/integration/pre_run_script.sh
secrets: inherit
with:
pre-run-script: |
-c "sudo microk8s config > ${GITHUB_WORKSPACE}/kube-config
chmod +x tests/integration/pre_run_script.sh
./tests/integration/pre_run_script.sh"
extra-arguments: |
--kube-config ${GITHUB_WORKSPACE}/kube-config
modules: '["test_agent_k8s.py", "test_agent_machine.py"]'
2 changes: 1 addition & 1 deletion jenkins_agent_k8s_rock/files/entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ touch "${JENKINS_HOME}/agents/.ready"

# Start Jenkins agent
echo "${JENKINS_AGENT}"
${JAVA} -jar ${AGENT_JAR} -jnlpUrl "${JENKINS_URL}/computer/${JENKINS_AGENT}/slave-agent.jnlp" -workDir "${JENKINS_HOME}" -noReconnect -secret "${JENKINS_TOKEN}" || echo "Invalid or already used credentials."
${JAVA} -jar ${AGENT_JAR} -jnlpUrl "${JENKINS_URL}/computer/${JENKINS_AGENT}/jenkins-agent.jnlp" -workDir "${JENKINS_HOME}" -noReconnect -secret "${JENKINS_TOKEN}" || echo "Invalid or already used credentials."

# Remove ready mark if unsuccessful
rm ${JENKINS_HOME}/agents/.ready
30 changes: 30 additions & 0 deletions src-docs/agent.py.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,34 @@ Shortcut for more simple access the model.



---

<a href="../src/agent.py#L199"><img align="right" style="float:right;" src="https://img.shields.io/badge/-source-cccccc?style=flat-square"></a>

### <kbd>function</kbd> `start_agent_from_relation`

```python
start_agent_from_relation(
container: Container,
credentials: Credentials,
agent_name: str
) → None
```

Start agent from agent relation.



**Args:**

- <b>`container`</b>: The Jenkins agent workload container.
- <b>`credentials`</b>: The agent registration details for jenkins server.
- <b>`agent_name`</b>: The jenkins agent to register as.



**Raises:**

- <b>`AgentJarDownloadError`</b>: if the agent jar executable failed to download.


49 changes: 29 additions & 20 deletions src/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def _on_slave_relation_changed(self, event: ops.RelationChangedEvent) -> None:
event: The event fired when slave relation data has changed.
Raises:
RuntimeError: if the Jenkins agent failed to download.
AgentJarDownloadError: if the Jenkins agent failed to download.
"""
logger.info("%s relation changed.", event.relation.name)

Expand Down Expand Up @@ -133,7 +133,7 @@ def _on_slave_relation_changed(self, event: ops.RelationChangedEvent) -> None:
)
except server.AgentJarDownloadError as exc:
logger.error("Failed to download Jenkins agent executable, %s", exc)
raise RuntimeError("Failed to download Jenkins agent.") from exc
raise

self.charm.unit.status = ops.MaintenanceStatus("Validating credentials.")
if not server.validate_credentials(
Expand All @@ -152,25 +152,17 @@ def _on_slave_relation_changed(self, event: ops.RelationChangedEvent) -> None:
self.charm.unit.status = ops.WaitingStatus("Waiting for credentials.")
return

self.charm.unit.status = ops.MaintenanceStatus("Starting agent pebble service.")
self.pebble_service.reconcile(
server_url=self.state.slave_relation_credentials.address,
agent_token_pair=(
self.state.agent_meta.name,
self.state.slave_relation_credentials.secret,
),
self.start_agent_from_relation(
container=container,
credentials=self.state.slave_relation_credentials,
agent_name=self.state.agent_meta.name,
)
self.charm.unit.status = ops.ActiveStatus()

def _on_agent_relation_changed(self, event: ops.RelationChangedEvent) -> None:
"""Handle agent relation changed event.
Args:
event: The event fired when the agent relation data has changed.
Raises:
RuntimeError: if the Jenkins agent failed to download.
"""
logger.info("%s relation changed.", event.relation.name)

Expand Down Expand Up @@ -198,21 +190,38 @@ def _on_agent_relation_changed(self, event: ops.RelationChangedEvent) -> None:
event.defer()
return

self.start_agent_from_relation(
container=container,
credentials=self.state.agent_relation_credentials,
agent_name=self.state.agent_meta.name,
)

def start_agent_from_relation(
self, container: ops.Container, credentials: server.Credentials, agent_name: str
) -> None:
"""Start agent from agent relation.
Args:
container: The Jenkins agent workload container.
credentials: The agent registration details for jenkins server.
agent_name: The jenkins agent to register as.
Raises:
AgentJarDownloadError: if the agent jar executable failed to download.
"""
self.charm.unit.status = ops.MaintenanceStatus("Downloading Jenkins agent executable.")
try:
server.download_jenkins_agent(
server_url=self.state.agent_relation_credentials.address, container=container
)
server.download_jenkins_agent(server_url=credentials.address, container=container)
except server.AgentJarDownloadError as exc:
logger.error("Failed to download Jenkins agent executable, %s", exc)
raise RuntimeError("Failed to download Jenkins agent.") from exc
raise server.AgentJarDownloadError("Failed to download Jenkins agent.") from exc

self.charm.unit.status = ops.MaintenanceStatus("Starting agent pebble service.")
self.pebble_service.reconcile(
server_url=self.state.agent_relation_credentials.address,
server_url=credentials.address,
agent_token_pair=(
self.state.agent_meta.name,
self.state.agent_relation_credentials.secret,
agent_name,
credentials.secret,
),
container=container,
)
Expand Down
27 changes: 25 additions & 2 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@ def __init__(self, *args: typing.Any):
self.framework.observe(self.on.config_changed, self._on_config_changed)
self.framework.observe(self.on.upgrade_charm, self._on_upgrade_charm)

self.framework.observe(
self.on.jenkins_k8s_agent_pebble_ready, self._on_jenkins_k8s_agent_pebble_ready
)

def _register_via_config(
self, event: typing.Union[ops.ConfigChangedEvent, ops.UpgradeCharmEvent]
) -> None:
Expand All @@ -50,7 +54,7 @@ def _register_via_config(
event: The event fired on config changed or upgrade charm.
Raises:
RuntimeError: if the Jenkins agent failed to download.
AgentJarDownloadError: if the Jenkins agent failed to download.
"""
container = self.unit.get_container(self.state.jenkins_agent_service_name)
if not container.can_connect():
Expand Down Expand Up @@ -85,7 +89,7 @@ def _register_via_config(
)
except server.AgentJarDownloadError as exc:
logger.error("Failed to download Agent JAR executable, %s", exc)
raise RuntimeError("Failed to download Jenkins agent. Fix issue ") from exc
raise

valid_agent_token = server.find_valid_credentials(
agent_name_token_pairs=self.state.jenkins_config.agent_name_token_pairs,
Expand Down Expand Up @@ -123,6 +127,25 @@ def _on_upgrade_charm(self, event: ops.UpgradeCharmEvent) -> None:
"""
self._register_via_config(event)

def _on_jenkins_k8s_agent_pebble_ready(self, _: ops.PebbleReadyEvent) -> None:
    """Restart the agent from relation data when the workload's pebble becomes ready.

    Pebble ready is fired
    1. during initial charm launch.
    2. when the container has restarted for various reasons.
    Case 2 is the one handled here so that the agent recovers after a container restart.
    """
    workload = self.unit.get_container(self.state.jenkins_agent_service_name)
    # The credentials are only consulted once the container is known to be reachable.
    if workload.can_connect() and self.state.agent_relation_credentials:
        self.agent_observer.start_agent_from_relation(
            container=workload,
            credentials=self.state.agent_relation_credentials,
            agent_name=self.state.agent_meta.name,
        )
        return

    logger.warning("Preconditions not ready.")


if __name__ == "__main__": # pragma: no cover
main(JenkinsAgentCharm)
2 changes: 2 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,5 @@ def pytest_addoption(parser: pytest.Parser):
parser.addoption("--jenkins-agent-k8s-image", action="store", default="")
# The prebuilt charm file.
parser.addoption("--charm-file", action="store", default="")
# The path to kubernetes config.
parser.addoption("--kube-config", action="store", default="~/.kube/config")
87 changes: 78 additions & 9 deletions tests/integration/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@
import typing

import jenkinsapi.jenkins
import kubernetes
import pytest
import pytest_asyncio
from juju.action import Action
from juju.application import Application
from juju.client._definitions import FullStatus, UnitStatus
from juju.model import Controller, Model
from juju.unit import Unit
from pytest import FixtureRequest
from pytest_operator.plugin import OpsTest

logger = logging.getLogger(__name__)
Expand All @@ -39,6 +41,23 @@ def model_fixture(ops_test: OpsTest) -> Model:
return ops_test.model


@pytest.fixture(scope="module", name="kube_config")
def kube_config_fixture(request: FixtureRequest) -> str:
    """Return the kubernetes config file path given by the --kube-config option.

    Args:
        request: The pytest fixture request used to read the command line options.

    Returns:
        The path to the kube config file.
    """
    kube_config = request.config.getoption("--kube-config")
    # An explicitly empty value (e.g. --kube-config "") is rejected here.
    assert (
        kube_config
    ), "--kube-config argument is required which should contain the path to kube config."
    return kube_config


@pytest.fixture(scope="module", name="kube_core_client")
def kube_core_client_fixture(kube_config: str) -> kubernetes.client.CoreV1Api:
    """Build a kubernetes core v1 API client from the given kube config file.

    Args:
        kube_config: The path to the kube config file to load.

    Returns:
        The core v1 API client created after the configuration has been loaded.
    """
    # The configuration must be loaded before the API client is instantiated.
    kubernetes.config.load_kube_config(config_file=kube_config)
    core_v1_api = kubernetes.client.CoreV1Api()
    return core_v1_api


@pytest.fixture(scope="module", name="agent_image")
def agent_image_fixture(request: pytest.FixtureRequest) -> str:
"""The OCI image for jenkins-agent-k8s charm."""
Expand Down Expand Up @@ -108,8 +127,10 @@ async def jenkins_machine_server_fixture(machine_model: Model) -> Application:
return app


@pytest_asyncio.fixture(scope="module", name="server_unit_ip")
async def server_unit_ip_fixture(machine_model: Model, jenkins_machine_server: Application):
@pytest_asyncio.fixture(scope="module", name="machine_server_unit_ip")
async def machine_server_unit_ip_fixture(
machine_model: Model, jenkins_machine_server: Application
):
"""Get Jenkins machine server charm unit IP."""
status: FullStatus = await machine_model.get_status([jenkins_machine_server.name])
try:
Expand All @@ -122,16 +143,16 @@ async def server_unit_ip_fixture(machine_model: Model, jenkins_machine_server: A
raise StopIteration("Invalid unit status") from exc


@pytest_asyncio.fixture(scope="module", name="web_address")
async def web_address_fixture(server_unit_ip: str):
@pytest_asyncio.fixture(scope="module", name="machine_web_address")
async def machine_web_address_fixture(machine_server_unit_ip: str):
"""Get Jenkins machine server charm web address."""
return f"http://{server_unit_ip}:8080"
return f"http://{machine_server_unit_ip}:8080"


@pytest_asyncio.fixture(scope="module", name="jenkins_client")
async def jenkins_client_fixture(
@pytest_asyncio.fixture(scope="module", name="machine_jenkins_client")
async def machine_jenkins_client_fixture(
jenkins_machine_server: Application,
web_address: str,
machine_web_address: str,
) -> jenkinsapi.jenkins.Jenkins:
"""The Jenkins API client."""
jenkins_unit: Unit = jenkins_machine_server.units[0]
Expand All @@ -143,5 +164,53 @@ async def jenkins_client_fixture(
# Initialization of the jenkins client will raise an exception if unable to connect to the
# server.
return jenkinsapi.jenkins.Jenkins(
baseurl=web_address, username="admin", password=password, timeout=60
baseurl=machine_web_address, username="admin", password=password, timeout=60
)


@pytest_asyncio.fixture(scope="module", name="jenkins_k8s_server")
async def jenkins_k8s_server_fixture(model: Model) -> Application:
    """Deploy the jenkins-k8s server charm and wait for it to become idle.

    Args:
        model: The juju model to deploy the server into.

    Returns:
        The deployed jenkins-k8s application.
    """
    jenkins_server = await model.deploy("jenkins-k8s", series="jammy", channel="latest/edge")
    server_apps = [jenkins_server.name]
    await model.wait_for_idle(apps=server_apps, timeout=1200, raise_on_error=False)
    return jenkins_server


@pytest_asyncio.fixture(scope="module", name="k8s_server_unit_ip")
async def k8s_server_unit_ip_fixture(model: Model, jenkins_k8s_server: Application):
    """Get Jenkins k8s server charm unit IP.

    Args:
        model: The juju model in which the Jenkins k8s server is deployed.
        jenkins_k8s_server: The Jenkins k8s server application.

    Returns:
        The address of the first unit reported in the application status.

    Raises:
        RuntimeError: if the application status contains no units.
    """
    status: FullStatus = await model.get_status([jenkins_k8s_server.name])
    units = status.applications[jenkins_k8s_server.name].units
    try:
        unit_status: UnitStatus = next(iter(units.values()))
    except StopIteration as exc:
        # A StopIteration escaping a coroutine is converted by the interpreter into a
        # generic RuntimeError (PEP 479), so raise an explicit one that keeps the message.
        raise RuntimeError("Invalid unit status") from exc
    assert unit_status.address, "Invalid unit address"
    return unit_status.address


@pytest_asyncio.fixture(scope="module", name="k8s_web_address")
async def k8s_web_address_fixture(k8s_server_unit_ip: str):
    """Build the web address of the Jenkins k8s server charm.

    Args:
        k8s_server_unit_ip: The IP address of the Jenkins k8s server unit.

    Returns:
        The http URL of the server on port 8080.
    """
    web_address = f"http://{k8s_server_unit_ip}:8080"
    return web_address


@pytest_asyncio.fixture(scope="module", name="jenkins_client")
async def jenkins_client_fixture(
    jenkins_k8s_server: Application,
    k8s_web_address: str,
) -> jenkinsapi.jenkins.Jenkins:
    """Create a Jenkins API client authenticated as the admin user.

    Args:
        jenkins_k8s_server: The Jenkins k8s server application.
        k8s_web_address: The web address of the Jenkins k8s server.

    Returns:
        The Jenkins API client connected to the k8s server.
    """
    # The admin password is fetched through the charm action on the first unit.
    first_unit: Unit = jenkins_k8s_server.units[0]
    password_action: Action = await first_unit.run_action("get-admin-password")
    await password_action.wait()
    assert password_action.status == "completed", "Failed to get credentials."
    admin_password = password_action.results["password"]

    # Initialization of the jenkins client will raise an exception if unable to connect to the
    # server.
    client = jenkinsapi.jenkins.Jenkins(
        baseurl=k8s_web_address, username="admin", password=admin_password, timeout=60
    )
    return client
47 changes: 47 additions & 0 deletions tests/integration/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Copyright 2024 Canonical Ltd.
# See LICENSE file for licensing details.

"""Helpers for Jenkins-agent-k8s-operator charm integration tests."""
import asyncio
import inspect
import time
import typing


async def wait_for(
    func: typing.Callable[[], typing.Union[typing.Awaitable, typing.Any]],
    timeout: int = 300,
    check_interval: int = 10,
) -> typing.Any:
    """Wait for function execution to become truthy.

    The callback may be a plain function, a coroutine function, or any callable (for
    example a lambda or functools.partial) that returns an awaitable; awaitable results
    are awaited before being checked for truthiness.

    Args:
        func: A callback function to wait to return a truthy value.
        timeout: Time in seconds to wait for function result to become truthy.
        check_interval: Time in seconds to wait between ready checks.

    Raises:
        TimeoutError: if the callback function did not return a truthy value within timeout.

    Returns:
        The result of the function if any.
    """

    async def _evaluate() -> typing.Any:
        """Call the callback once, awaiting the outcome when it is awaitable."""
        outcome = func()
        # Inspect the returned value rather than the callable, so that wrappers around
        # coroutine functions are awaited as well.
        if inspect.isawaitable(outcome):
            outcome = await outcome
        return outcome

    # A monotonic clock keeps the deadline unaffected by system clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if result := await _evaluate():
            return result
        await asyncio.sleep(check_interval)

    # final check before raising TimeoutError.
    if result := await _evaluate():
        return result
    raise TimeoutError(f"Callback did not return a truthy value within {timeout} seconds.")
Loading

0 comments on commit c5ac369

Please sign in to comment.