remove unnecessary updates to gke tpu job for pathways workloads
jesus-orozco committed Jan 14, 2025
1 parent ca5c883 commit c7bc5df
Showing 1 changed file with 5 additions and 18 deletions.
23 changes: 5 additions & 18 deletions axlearn/cloud/gcp/job.py
@@ -576,12 +576,7 @@ def _build_container(self, job_type: str = None) -> Nested[Any]:
         ]
 
         if self.using_pathways:
-            staging_location = f"{cfg.output_dir}/pathways-staging/tmp"
-            env_vars.update(
-                # dump XLA flags to GCS bucket for troubleshooting purposes.
-                XLA_FLAGS="--xla_dump_to=gs://ttl-30d-us-central2/axlearn/users/jesusfc/pathways/v6e/xla/"
-            )
-
+            staging_location = f"{cfg.output_dir}/pathways-staging"
             if job_type == "pathways-head":
                 env_vars.update(
                     # JAX_BACKEND_TARGET="grpc://$(HOST_ADDRESS):29000",
@@ -851,8 +846,7 @@ def _build_pod(self, job_type: str = None) -> Nested[Any]:
             labels.update({"bastion-tier": "reserved"})
         else:
             logging.info("Found tier=%s in env. Using spot quota", tier)
-            # Comment out selector when running against internal v6e test project
-            # selector.update({"cloud.google.com/gke-spot": "true"})
+            selector.update({"cloud.google.com/gke-spot": "true"})
             tolerations.append(
                 {
                     "key": "cloud.google.com/gke-spot",
@@ -893,7 +887,7 @@ def _build_pod(self, job_type: str = None) -> Nested[Any]:
                     # the original jobset attempts to restart (node pool conflict). This is more
                     # reliable at the moment but doesn't take advantage of node pool sharing. GCP is
                     # working on a fix.
-                    # "provisioner-nodepool-id": cfg.name,
+                    "provisioner-nodepool-id": cfg.name,
                 }
             )

@@ -933,7 +927,7 @@ def _build_pod(self, job_type: str = None) -> Nested[Any]:
             )
 
         if job_type == "pathways-head":
-            # selector.update({"pathways-head": "true"})
+            # Target a specific CPU nodepool for Pathways containers
             selector.update({"cloud.google.com/gke-nodepool": "pathways-head"})
             initContainers.extend(self._build_pathways_containers())
         else:
@@ -965,6 +959,7 @@ def _build_pod(self, job_type: str = None) -> Nested[Any]:
             initContainers=initContainers,
             serviceAccountName=cfg.service_account,
             volumes=volumes,
+            # Enable host network for optimal performance with Pathways
             hostNetwork=True if self.using_pathways else False,
             dnsPolicy="ClusterFirstWithHostNet" if self.using_pathways else None,
         )
@@ -999,9 +994,6 @@ def _build_job(self, job_type: str = None) -> Nested[Any]:
                 template=self._build_pod(job_type),
             )
         elif job_type == "pathways-head":
-            #annotations.update(
-            #    {"alpha.jobset.sigs.k8s.io/exclusive-topology": "kubernetes.io/hostname"}
-            #)
             spec.update(
                 parallelism=1,
                 completions=1,
@@ -1108,11 +1100,6 @@ def _execute(self) -> Any:
             kind="JobSet",
             **self._build_jobset(),
         )
-        with open(f"jobsets/{cfg.name}.yaml", "w") as f:
-            logging.info("Output jobset to yaml file...")
-            import yaml
-
-            yaml.dump(custom_object, f, default_flow_style=False)
         logging.info("Submitting JobSet body=%s api_kwargs=%s", custom_object, api_kwargs)
         return k8s.client.CustomObjectsApi().create_namespaced_custom_object(
             namespace=cfg.namespace,

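For readers unfamiliar with how the pod-level settings touched in this diff end up in the JobSet manifest, the sketch below shows roughly how the spot selector/toleration, the pathways-head nodepool selector, and host networking compose into a pod spec dict. It is an illustrative, self-contained sketch, not AXLearn's actual `_build_pod`; the helper name and the toleration fields beyond the key are assumptions.

```python
# Illustrative sketch only: mirrors the selector/toleration/hostNetwork handling seen
# in the diff above. `build_pathways_pod_overrides` is a hypothetical helper, and the
# toleration fields other than the key are assumed values for a GKE spot taint.
from typing import Any, Dict, List


def build_pathways_pod_overrides(job_type: str, use_spot: bool) -> Dict[str, Any]:
    selector: Dict[str, str] = {}
    tolerations: List[Dict[str, str]] = []

    if use_spot:
        # Schedule onto spot nodes and tolerate the matching taint.
        selector.update({"cloud.google.com/gke-spot": "true"})
        tolerations.append(
            {
                "key": "cloud.google.com/gke-spot",
                "operator": "Equal",  # assumed
                "value": "true",  # assumed
                "effect": "NoSchedule",  # assumed
            }
        )

    if job_type == "pathways-head":
        # The Pathways head targets a dedicated CPU nodepool.
        selector.update({"cloud.google.com/gke-nodepool": "pathways-head"})

    return dict(
        nodeSelector=selector,
        tolerations=tolerations,
        # Pathways pods use host networking with a host-network-aware DNS policy.
        hostNetwork=True,
        dnsPolicy="ClusterFirstWithHostNet",
    )
```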
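The `_execute` hunk above submits the JobSet through the Kubernetes custom-objects API. As a rough, standalone illustration, assuming the upstream JobSet CRD's group/version/plural (which the real code passes via `**api_kwargs`) and placeholder metadata and spec:

```python
# Standalone sketch, not AXLearn's _execute: submits a JobSet custom object with the
# official `kubernetes` Python client. The group/version/plural values assume the
# upstream JobSet CRD defaults; the metadata and spec here are placeholders.
import kubernetes as k8s

k8s.config.load_kube_config()  # or k8s.config.load_incluster_config() inside a pod

custom_object = dict(
    apiVersion="jobset.x-k8s.io/v1alpha2",  # assumed JobSet API version
    kind="JobSet",
    metadata=dict(name="example-pathways-job"),  # placeholder name
    spec=dict(replicatedJobs=[]),  # the real spec comes from _build_jobset()
)

k8s.client.CustomObjectsApi().create_namespaced_custom_object(
    group="jobset.x-k8s.io",  # assumed
    version="v1alpha2",  # assumed
    namespace="default",
    plural="jobsets",
    body=custom_object,
)
```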