From 11fe336d2941b255545ee1a37264085b9f5bec76 Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Tue, 26 Mar 2024 16:25:08 +0900 Subject: [PATCH 01/76] revise SGE - add option: "strategy": {"customized_script_header_template_file": ""}, - add option: `sge_pe_name` --- dpdispatcher/machines/pbs.py | 56 ++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/dpdispatcher/machines/pbs.py b/dpdispatcher/machines/pbs.py index a94cd2a1..5581c529 100644 --- a/dpdispatcher/machines/pbs.py +++ b/dpdispatcher/machines/pbs.py @@ -189,34 +189,46 @@ def gen_script_header(self, job): class SGE(PBS): - def __init__( - self, - batch_type=None, - context_type=None, - local_root=None, - remote_root=None, - remote_profile={}, - *, - context=None, - ): - super(PBS, self).__init__( - batch_type, - context_type, - local_root, - remote_root, - remote_profile, - context=context, - ) + def __init__(self, + batch_type=None, + context_type=None, + local_root=None, + remote_root=None, + remote_profile={}, + *, + context=None): + super(PBS, self).__init__(batch_type, + context_type, + local_root, + remote_root, + remote_profile, + context=context) def gen_script_header(self, job): + ### Ref:https://softpanorama.org/HPC/PBS_and_derivatives/Reference/pbs_command_vs_sge_commands.shtml + # resources.number_node is not used in SGE resources = job.resources sge_script_header_dict = {} - # resources.number_node is not used + if resources.sge_pe_name != '': + sge_pe_name = resources.sge_pe_name + else: + sge_pe_name = "mpi" sge_script_header_dict["select_node_line"] = ( - f"#$ -pe mpi {resources.cpu_per_node} " + f"#$ -pe {sge_pe_name} {resources.cpu_per_node} " ) - # resources.queue_name is not necessary - sge_script_header = sge_script_header_template.format(**sge_script_header_dict) + if resources.queue_name != '': + sge_script_header_dict["select_node_line"] += ( + f"#$ -q {resources.queue_name}" + ) + + if (resources["strategy"].get("customized_script_header_template_file") + is not None): + sge_script_header = customized_script_header_template( + resources["strategy"]["customized_script_header_template_file"], + resources,) + else: + sge_script_header = sge_script_header_template.format( + **sge_script_header_dict) return sge_script_header def do_submit(self, job): From 8140a43572167bcdd6c385db705e5f2816aaae21 Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Tue, 26 Mar 2024 17:06:01 +0900 Subject: [PATCH 02/76] Update pbs.py --- dpdispatcher/machines/pbs.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dpdispatcher/machines/pbs.py b/dpdispatcher/machines/pbs.py index 5581c529..66d1d468 100644 --- a/dpdispatcher/machines/pbs.py +++ b/dpdispatcher/machines/pbs.py @@ -196,13 +196,15 @@ def __init__(self, remote_root=None, remote_profile={}, *, - context=None): + context=None, + sge_pe_name = '',): super(PBS, self).__init__(batch_type, context_type, local_root, remote_root, remote_profile, - context=context) + context=context, + sge_pe_name = sge_pe_name) def gen_script_header(self, job): ### Ref:https://softpanorama.org/HPC/PBS_and_derivatives/Reference/pbs_command_vs_sge_commands.shtml From b893544e7e5ba7ddfa2c9e0882a5f1b19e3ffbed Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Tue, 26 Mar 2024 17:09:46 +0900 Subject: [PATCH 03/76] Update pbs.py --- dpdispatcher/machines/pbs.py | 6 ++---- 1 file changed, 2 
insertions(+), 4 deletions(-) diff --git a/dpdispatcher/machines/pbs.py b/dpdispatcher/machines/pbs.py index 66d1d468..93e870f5 100644 --- a/dpdispatcher/machines/pbs.py +++ b/dpdispatcher/machines/pbs.py @@ -196,15 +196,13 @@ def __init__(self, remote_root=None, remote_profile={}, *, - context=None, - sge_pe_name = '',): + context=None,): super(PBS, self).__init__(batch_type, context_type, local_root, remote_root, remote_profile, - context=context, - sge_pe_name = sge_pe_name) + context=context,) def gen_script_header(self, job): ### Ref:https://softpanorama.org/HPC/PBS_and_derivatives/Reference/pbs_command_vs_sge_commands.shtml From 82b72623e0974029d2f00bf0bce6ed3a3bae95d1 Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Tue, 26 Mar 2024 17:28:28 +0900 Subject: [PATCH 04/76] Update pbs.py --- dpdispatcher/machines/pbs.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/dpdispatcher/machines/pbs.py b/dpdispatcher/machines/pbs.py index 93e870f5..31fa46c7 100644 --- a/dpdispatcher/machines/pbs.py +++ b/dpdispatcher/machines/pbs.py @@ -209,10 +209,11 @@ def gen_script_header(self, job): # resources.number_node is not used in SGE resources = job.resources sge_script_header_dict = {} - if resources.sge_pe_name != '': - sge_pe_name = resources.sge_pe_name - else: - sge_pe_name = "mpi" + # if resources.sge_pe_name != '': + # sge_pe_name = resources.sge_pe_name + # else: + # sge_pe_name = "mpi" + sge_pe_name = "mpi" sge_script_header_dict["select_node_line"] = ( f"#$ -pe {sge_pe_name} {resources.cpu_per_node} " ) From 01ffd14199904b35a64dd47de54792435a7320df Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Tue, 26 Mar 2024 17:56:33 +0900 Subject: [PATCH 05/76] add sge_qe_name --- dpdispatcher/machines/pbs.py | 9 ++++----- dpdispatcher/submission.py | 9 ++++++++- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/dpdispatcher/machines/pbs.py b/dpdispatcher/machines/pbs.py index 31fa46c7..93e870f5 100644 --- a/dpdispatcher/machines/pbs.py +++ b/dpdispatcher/machines/pbs.py @@ -209,11 +209,10 @@ def gen_script_header(self, job): # resources.number_node is not used in SGE resources = job.resources sge_script_header_dict = {} - # if resources.sge_pe_name != '': - # sge_pe_name = resources.sge_pe_name - # else: - # sge_pe_name = "mpi" - sge_pe_name = "mpi" + if resources.sge_pe_name != '': + sge_pe_name = resources.sge_pe_name + else: + sge_pe_name = "mpi" sge_script_header_dict["select_node_line"] = ( f"#$ -pe {sge_pe_name} {resources.cpu_per_node} " ) diff --git a/dpdispatcher/submission.py b/dpdispatcher/submission.py index 9c5c73ca..41b5dee0 100644 --- a/dpdispatcher/submission.py +++ b/dpdispatcher/submission.py @@ -473,7 +473,7 @@ def generate_jobs(self): random_task_index = list(range(task_num)) random.shuffle(random_task_index) random_task_index_ll = [ - random_task_index[ii : ii + group_size] + random_task_index[ii: ii + group_size] for ii in range(0, task_num, group_size) ] @@ -981,6 +981,8 @@ class Resources: The env file to be sourced before the command execution. wait_time : int The waitting time in second after a single task submitted. Default: 0. + sge_pe_name : str + The parallel environment name of SGE. 
""" def __init__( @@ -1002,6 +1004,7 @@ def __init__( prepend_script=[], append_script=[], wait_time=0, + sge_pe_name="", **kwargs, ): self.number_node = number_node @@ -1022,6 +1025,7 @@ def __init__( self.prepend_script = prepend_script self.append_script = append_script self.wait_time = wait_time + self.sge_pe_name = sge_pe_name # self.if_cuda_multi_devices = if_cuda_multi_devices self.kwargs = kwargs.get("kwargs", kwargs) @@ -1068,6 +1072,7 @@ def serialize(self): resources_dict["prepend_script"] = self.prepend_script resources_dict["append_script"] = self.append_script resources_dict["wait_time"] = self.wait_time + resources_dict["sge_pe_name"] = self.sge_pe_name resources_dict["kwargs"] = self.kwargs return resources_dict @@ -1090,6 +1095,7 @@ def deserialize(cls, resources_dict): prepend_script=resources_dict.get("prepend_script", []), append_script=resources_dict.get("append_script", []), wait_time=resources_dict.get("wait_time", 0), + sge_pe_name=resources_dict.get("sge_pe_name", ""), **resources_dict.get("kwargs", {}), ) return resources @@ -1229,6 +1235,7 @@ def arginfo(detail_kwargs=True): Argument( "wait_time", [int, float], optional=True, doc=doc_wait_time, default=0 ), + Argument("sge_pe_name", str, optional=True, doc="The name of sge's parallel environment."), ] if detail_kwargs: From 56e3afe5f77efe2124720b436962255f3d5c7e6a Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Tue, 26 Mar 2024 18:09:39 +0900 Subject: [PATCH 06/76] Update pbs.py --- dpdispatcher/machines/pbs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpdispatcher/machines/pbs.py b/dpdispatcher/machines/pbs.py index 93e870f5..a918617c 100644 --- a/dpdispatcher/machines/pbs.py +++ b/dpdispatcher/machines/pbs.py @@ -214,7 +214,7 @@ def gen_script_header(self, job): else: sge_pe_name = "mpi" sge_script_header_dict["select_node_line"] = ( - f"#$ -pe {sge_pe_name} {resources.cpu_per_node} " + f"#$ -pe {sge_pe_name} {resources.cpu_per_node}\n" ) if resources.queue_name != '': sge_script_header_dict["select_node_line"] += ( From 703aa4933371fad850d94ca7e74f755208b84c3f Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Tue, 26 Mar 2024 18:21:10 +0900 Subject: [PATCH 07/76] y --- dpdispatcher/machines/pbs.py | 6 +----- dpdispatcher/submission.py | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/dpdispatcher/machines/pbs.py b/dpdispatcher/machines/pbs.py index a918617c..d3937a3f 100644 --- a/dpdispatcher/machines/pbs.py +++ b/dpdispatcher/machines/pbs.py @@ -209,12 +209,8 @@ def gen_script_header(self, job): # resources.number_node is not used in SGE resources = job.resources sge_script_header_dict = {} - if resources.sge_pe_name != '': - sge_pe_name = resources.sge_pe_name - else: - sge_pe_name = "mpi" sge_script_header_dict["select_node_line"] = ( - f"#$ -pe {sge_pe_name} {resources.cpu_per_node}\n" + f"#$ -pe {resources.sge_pe_name} {resources.cpu_per_node}\n" ) if resources.queue_name != '': sge_script_header_dict["select_node_line"] += ( diff --git a/dpdispatcher/submission.py b/dpdispatcher/submission.py index 41b5dee0..8f69c060 100644 --- a/dpdispatcher/submission.py +++ b/dpdispatcher/submission.py @@ -1095,7 +1095,7 @@ def deserialize(cls, resources_dict): prepend_script=resources_dict.get("prepend_script", []), append_script=resources_dict.get("append_script", []), wait_time=resources_dict.get("wait_time", 0), - sge_pe_name=resources_dict.get("sge_pe_name", ""), + 
sge_pe_name=resources_dict.get("sge_pe_name", "mpi"), **resources_dict.get("kwargs", {}), ) return resources From 2c3b38aa8dd43a07ca849646d410339df4695ef7 Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Tue, 26 Mar 2024 19:48:15 +0900 Subject: [PATCH 08/76] Update submission.py --- dpdispatcher/submission.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dpdispatcher/submission.py b/dpdispatcher/submission.py index 8f69c060..2f76c343 100644 --- a/dpdispatcher/submission.py +++ b/dpdispatcher/submission.py @@ -991,6 +991,7 @@ def __init__( cpu_per_node, gpu_per_node, queue_name, + sge_pe_name, group_size, *, custom_flags=[], @@ -1004,7 +1005,6 @@ def __init__( prepend_script=[], append_script=[], wait_time=0, - sge_pe_name="", **kwargs, ): self.number_node = number_node @@ -1059,6 +1059,7 @@ def serialize(self): resources_dict["cpu_per_node"] = self.cpu_per_node resources_dict["gpu_per_node"] = self.gpu_per_node resources_dict["queue_name"] = self.queue_name + resources_dict["sge_pe_name"] = self.sge_pe_name resources_dict["group_size"] = self.group_size resources_dict["custom_flags"] = self.custom_flags @@ -1072,7 +1073,6 @@ def serialize(self): resources_dict["prepend_script"] = self.prepend_script resources_dict["append_script"] = self.append_script resources_dict["wait_time"] = self.wait_time - resources_dict["sge_pe_name"] = self.sge_pe_name resources_dict["kwargs"] = self.kwargs return resources_dict @@ -1083,6 +1083,7 @@ def deserialize(cls, resources_dict): cpu_per_node=resources_dict.get("cpu_per_node", 1), gpu_per_node=resources_dict.get("gpu_per_node", 0), queue_name=resources_dict.get("queue_name", ""), + sge_pe_name=resources_dict.get("sge_pe_name", "mpi"), group_size=resources_dict["group_size"], custom_flags=resources_dict.get("custom_flags", []), strategy=resources_dict.get("strategy", default_strategy), @@ -1095,7 +1096,6 @@ def deserialize(cls, resources_dict): prepend_script=resources_dict.get("prepend_script", []), append_script=resources_dict.get("append_script", []), wait_time=resources_dict.get("wait_time", 0), - sge_pe_name=resources_dict.get("sge_pe_name", "mpi"), **resources_dict.get("kwargs", {}), ) return resources @@ -1196,6 +1196,7 @@ def arginfo(detail_kwargs=True): "gpu_per_node", int, optional=True, doc=doc_gpu_per_node, default=0 ), Argument("queue_name", str, optional=True, doc=doc_queue_name, default=""), + Argument("sge_pe_name", str, optional=True, doc="The name of sge's parallel environment."), Argument("group_size", int, optional=False, doc=doc_group_size), Argument("custom_flags", List[str], optional=True, doc=doc_custom_flags), # Argument("strategy", dict, optional=True, doc=doc_strategy,default=default_strategy), @@ -1235,7 +1236,6 @@ def arginfo(detail_kwargs=True): Argument( "wait_time", [int, float], optional=True, doc=doc_wait_time, default=0 ), - Argument("sge_pe_name", str, optional=True, doc="The name of sge's parallel environment."), ] if detail_kwargs: From f73e0f9263bb80c7bb3ece680cbc5c1f7732d41f Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Wed, 27 Mar 2024 00:06:29 +0900 Subject: [PATCH 09/76] u --- dpdispatcher/machine.py | 3 +-- dpdispatcher/machines/pbs.py | 7 ++++--- dpdispatcher/utils/utils.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dpdispatcher/machine.py b/dpdispatcher/machine.py index 214a4902..1f0e6f80 100644 --- a/dpdispatcher/machine.py +++ b/dpdispatcher/machine.py @@ -261,8 
+261,7 @@ def gen_script_env(self, job): source_list = job.resources.source_list for ii in source_list: - line = "{ source %s; } \n" % ii - source_files_part += line + source_files_part += f"source {ii}\n" export_envs_part = "" envs = job.resources.envs diff --git a/dpdispatcher/machines/pbs.py b/dpdispatcher/machines/pbs.py index d3937a3f..7e49b709 100644 --- a/dpdispatcher/machines/pbs.py +++ b/dpdispatcher/machines/pbs.py @@ -181,10 +181,10 @@ def gen_script_header(self, job): sge_script_header_template = """ #!/bin/bash -#$ -N dpdispatcher_submit -{select_node_line} +#$ -S /bin/bash #$ -cwd - +#$ -N dp_job +{select_node_line} """ @@ -230,6 +230,7 @@ def gen_script_header(self, job): def do_submit(self, job): script_file_name = job.script_file_name script_str = self.gen_script(job) + script_str = script_str.replace(f"source $REMOTE_ROOT/{job.script_file_name}.run", f"source $REMOTE_ROOT/{job.script_file_name}") job_id_name = job.job_hash + "_job_id" self.context.write_file(fname=script_file_name, write_str=script_str) script_file_dir = self.context.remote_root diff --git a/dpdispatcher/utils/utils.py b/dpdispatcher/utils/utils.py index cec28f54..9f54c490 100644 --- a/dpdispatcher/utils/utils.py +++ b/dpdispatcher/utils/utils.py @@ -42,7 +42,7 @@ def hotp(key: str, period: int, token_length: int = 6, digest="sha1"): period_ = struct.pack(">Q", period) mac = hmac.new(key_, period_, digest).digest() offset = mac[-1] & 0x0F - binary = struct.unpack(">L", mac[offset : offset + 4])[0] & 0x7FFFFFFF + binary = struct.unpack(">L", mac[offset: offset + 4])[0] & 0x7FFFFFFF return str(binary)[-token_length:].zfill(token_length) From 957cf4aeafedd30e7bf3d8a941ecdab6b51c472a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 26 Mar 2024 15:12:02 +0000 Subject: [PATCH 10/76] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dpdispatcher/machines/pbs.py | 51 ++++++++++++++++++++++-------------- dpdispatcher/submission.py | 9 +++++-- dpdispatcher/utils/utils.py | 2 +- 3 files changed, 39 insertions(+), 23 deletions(-) diff --git a/dpdispatcher/machines/pbs.py b/dpdispatcher/machines/pbs.py index 7e49b709..b3199cb9 100644 --- a/dpdispatcher/machines/pbs.py +++ b/dpdispatcher/machines/pbs.py @@ -189,20 +189,24 @@ def gen_script_header(self, job): class SGE(PBS): - def __init__(self, - batch_type=None, - context_type=None, - local_root=None, - remote_root=None, - remote_profile={}, - *, - context=None,): - super(PBS, self).__init__(batch_type, - context_type, - local_root, - remote_root, - remote_profile, - context=context,) + def __init__( + self, + batch_type=None, + context_type=None, + local_root=None, + remote_root=None, + remote_profile={}, + *, + context=None, + ): + super(PBS, self).__init__( + batch_type, + context_type, + local_root, + remote_root, + remote_profile, + context=context, + ) def gen_script_header(self, job): ### Ref:https://softpanorama.org/HPC/PBS_and_derivatives/Reference/pbs_command_vs_sge_commands.shtml @@ -212,25 +216,32 @@ def gen_script_header(self, job): sge_script_header_dict["select_node_line"] = ( f"#$ -pe {resources.sge_pe_name} {resources.cpu_per_node}\n" ) - if resources.queue_name != '': + if resources.queue_name != "": sge_script_header_dict["select_node_line"] += ( f"#$ -q {resources.queue_name}" ) - if (resources["strategy"].get("customized_script_header_template_file") - is not None): + if ( + 
resources["strategy"].get("customized_script_header_template_file") + is not None + ): sge_script_header = customized_script_header_template( resources["strategy"]["customized_script_header_template_file"], - resources,) + resources, + ) else: sge_script_header = sge_script_header_template.format( - **sge_script_header_dict) + **sge_script_header_dict + ) return sge_script_header def do_submit(self, job): script_file_name = job.script_file_name script_str = self.gen_script(job) - script_str = script_str.replace(f"source $REMOTE_ROOT/{job.script_file_name}.run", f"source $REMOTE_ROOT/{job.script_file_name}") + script_str = script_str.replace( + f"source $REMOTE_ROOT/{job.script_file_name}.run", + f"source $REMOTE_ROOT/{job.script_file_name}", + ) job_id_name = job.job_hash + "_job_id" self.context.write_file(fname=script_file_name, write_str=script_str) script_file_dir = self.context.remote_root diff --git a/dpdispatcher/submission.py b/dpdispatcher/submission.py index 2f76c343..b79204df 100644 --- a/dpdispatcher/submission.py +++ b/dpdispatcher/submission.py @@ -473,7 +473,7 @@ def generate_jobs(self): random_task_index = list(range(task_num)) random.shuffle(random_task_index) random_task_index_ll = [ - random_task_index[ii: ii + group_size] + random_task_index[ii : ii + group_size] for ii in range(0, task_num, group_size) ] @@ -1196,7 +1196,12 @@ def arginfo(detail_kwargs=True): "gpu_per_node", int, optional=True, doc=doc_gpu_per_node, default=0 ), Argument("queue_name", str, optional=True, doc=doc_queue_name, default=""), - Argument("sge_pe_name", str, optional=True, doc="The name of sge's parallel environment."), + Argument( + "sge_pe_name", + str, + optional=True, + doc="The name of sge's parallel environment.", + ), Argument("group_size", int, optional=False, doc=doc_group_size), Argument("custom_flags", List[str], optional=True, doc=doc_custom_flags), # Argument("strategy", dict, optional=True, doc=doc_strategy,default=default_strategy), diff --git a/dpdispatcher/utils/utils.py b/dpdispatcher/utils/utils.py index 9f54c490..cec28f54 100644 --- a/dpdispatcher/utils/utils.py +++ b/dpdispatcher/utils/utils.py @@ -42,7 +42,7 @@ def hotp(key: str, period: int, token_length: int = 6, digest="sha1"): period_ = struct.pack(">Q", period) mac = hmac.new(key_, period_, digest).digest() offset = mac[-1] & 0x0F - binary = struct.unpack(">L", mac[offset: offset + 4])[0] & 0x7FFFFFFF + binary = struct.unpack(">L", mac[offset : offset + 4])[0] & 0x7FFFFFFF return str(binary)[-token_length:].zfill(token_length) From 034c9d7d4a673b2325e85075b9350fb783e407ba Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Wed, 27 Mar 2024 02:17:14 +0900 Subject: [PATCH 11/76] Update pbs.py --- dpdispatcher/machines/pbs.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/dpdispatcher/machines/pbs.py b/dpdispatcher/machines/pbs.py index 7e49b709..ec464f65 100644 --- a/dpdispatcher/machines/pbs.py +++ b/dpdispatcher/machines/pbs.py @@ -4,6 +4,7 @@ from dpdispatcher.machine import Machine from dpdispatcher.utils.job_status import JobStatus from dpdispatcher.utils.utils import customized_script_header_template +from pathlib import Path pbs_script_header_template = """ #!/bin/bash -l @@ -219,9 +220,8 @@ def gen_script_header(self, job): if (resources["strategy"].get("customized_script_header_template_file") is not None): - sge_script_header = customized_script_header_template( - resources["strategy"]["customized_script_header_template_file"], - 
resources,) + filename = self.context.remote_root / Path(resources["strategy"]["customized_script_header_template_file"]) + sge_script_header = customized_script_header_template(str(filename.as_posix()), resources) else: sge_script_header = sge_script_header_template.format( **sge_script_header_dict) @@ -230,9 +230,11 @@ def gen_script_header(self, job): def do_submit(self, job): script_file_name = job.script_file_name script_str = self.gen_script(job) - script_str = script_str.replace(f"source $REMOTE_ROOT/{job.script_file_name}.run", f"source $REMOTE_ROOT/{job.script_file_name}") job_id_name = job.job_hash + "_job_id" self.context.write_file(fname=script_file_name, write_str=script_str) + script_run_str = self.gen_script_command(job) + script_run_file_name = f"{job.script_file_name}.run" + self.context.write_file(fname=script_run_file_name, write_str=script_run_str) script_file_dir = self.context.remote_root stdin, stdout, stderr = self.context.block_checkcall( "cd {} && {} {}".format(script_file_dir, "qsub", script_file_name) From 3f07742f58c974c43f9069d4606e59e1830e9bb2 Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Wed, 27 Mar 2024 02:25:05 +0900 Subject: [PATCH 12/76] Update pbs.py --- dpdispatcher/machines/pbs.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/dpdispatcher/machines/pbs.py b/dpdispatcher/machines/pbs.py index ec464f65..2a78c74e 100644 --- a/dpdispatcher/machines/pbs.py +++ b/dpdispatcher/machines/pbs.py @@ -4,7 +4,6 @@ from dpdispatcher.machine import Machine from dpdispatcher.utils.job_status import JobStatus from dpdispatcher.utils.utils import customized_script_header_template -from pathlib import Path pbs_script_header_template = """ #!/bin/bash -l @@ -220,8 +219,8 @@ def gen_script_header(self, job): if (resources["strategy"].get("customized_script_header_template_file") is not None): - filename = self.context.remote_root / Path(resources["strategy"]["customized_script_header_template_file"]) - sge_script_header = customized_script_header_template(str(filename.as_posix()), resources) + filename = resources["strategy"]["customized_script_header_template_file"] + sge_script_header = customized_script_header_template(filename, resources) else: sge_script_header = sge_script_header_template.format( **sge_script_header_dict) From ef28a1b1022cfc409ef96b62759b38bfac42cd0f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 26 Mar 2024 17:31:17 +0000 Subject: [PATCH 13/76] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dpdispatcher/machines/pbs.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dpdispatcher/machines/pbs.py b/dpdispatcher/machines/pbs.py index 781d1e81..23d9ff76 100644 --- a/dpdispatcher/machines/pbs.py +++ b/dpdispatcher/machines/pbs.py @@ -221,9 +221,10 @@ def gen_script_header(self, job): f"#$ -q {resources.queue_name}" ) - - if (resources["strategy"].get("customized_script_header_template_file") - is not None): + if ( + resources["strategy"].get("customized_script_header_template_file") + is not None + ): filename = resources["strategy"]["customized_script_header_template_file"] sge_script_header = customized_script_header_template(filename, resources) else: From 65dca91e70defaab9955525fde53c18e61fdbb54 Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Wed, 27 Mar 2024 14:15:34 +0900 
Subject: [PATCH 14/76] u --- tests/sample_class.py | 1 + tests/test_argcheck.py | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/sample_class.py b/tests/sample_class.py index 7c663094..3a0b5492 100644 --- a/tests/sample_class.py +++ b/tests/sample_class.py @@ -42,6 +42,7 @@ def get_sample_resources_dict(cls): "cpu_per_node": 4, "gpu_per_node": 1, "queue_name": "T4_4_15", + "sge_pe_name": "mpi", "group_size": 2, "custom_flags": [], "strategy": {"if_cuda_multi_devices": False, "ratio_unfinished": 0.0}, diff --git a/tests/test_argcheck.py b/tests/test_argcheck.py index 2ef1883e..67f03f93 100644 --- a/tests/test_argcheck.py +++ b/tests/test_argcheck.py @@ -60,6 +60,7 @@ def test_resources_argcheck(self): "para_deg": 1, "prepend_script": [], "queue_name": "haha", + 'sge_pe_name': 'mpi', "source_list": [], "strategy": {"if_cuda_multi_devices": False, "ratio_unfinished": 0.0}, "wait_time": 0, From 8d7b56627f5b148d0999e6426e9db9290b46cda8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 27 Mar 2024 05:17:09 +0000 Subject: [PATCH 15/76] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_argcheck.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_argcheck.py b/tests/test_argcheck.py index 67f03f93..f007986d 100644 --- a/tests/test_argcheck.py +++ b/tests/test_argcheck.py @@ -60,7 +60,7 @@ def test_resources_argcheck(self): "para_deg": 1, "prepend_script": [], "queue_name": "haha", - 'sge_pe_name': 'mpi', + "sge_pe_name": "mpi", "source_list": [], "strategy": {"if_cuda_multi_devices": False, "ratio_unfinished": 0.0}, "wait_time": 0, From 8c50c4f595ee34f03ad1c9759f88b41f381504f4 Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Wed, 27 Mar 2024 14:21:51 +0900 Subject: [PATCH 16/76] u --- dpdispatcher/machines/pbs.py | 1 - tests/sample_class.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/dpdispatcher/machines/pbs.py b/dpdispatcher/machines/pbs.py index 781d1e81..ac9cf366 100644 --- a/dpdispatcher/machines/pbs.py +++ b/dpdispatcher/machines/pbs.py @@ -221,7 +221,6 @@ def gen_script_header(self, job): f"#$ -q {resources.queue_name}" ) - if (resources["strategy"].get("customized_script_header_template_file") is not None): filename = resources["strategy"]["customized_script_header_template_file"] diff --git a/tests/sample_class.py b/tests/sample_class.py index 3a0b5492..94a95bbd 100644 --- a/tests/sample_class.py +++ b/tests/sample_class.py @@ -24,6 +24,7 @@ def get_sample_resources(cls): cpu_per_node=4, gpu_per_node=1, queue_name="T4_4_15", + sge_pe_name= "mpi", group_size=2, custom_flags=[], strategy={"if_cuda_multi_devices": False}, From 920984d3acd6730b827800083ac5198d356ab1c6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 27 Mar 2024 05:23:35 +0000 Subject: [PATCH 17/76] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dpdispatcher/machines/pbs.py | 6 ++++-- tests/sample_class.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/dpdispatcher/machines/pbs.py b/dpdispatcher/machines/pbs.py index ac9cf366..23d9ff76 100644 --- a/dpdispatcher/machines/pbs.py +++ b/dpdispatcher/machines/pbs.py @@ -221,8 +221,10 @@ def gen_script_header(self, job): f"#$ -q {resources.queue_name}" ) - if 
(resources["strategy"].get("customized_script_header_template_file") - is not None): + if ( + resources["strategy"].get("customized_script_header_template_file") + is not None + ): filename = resources["strategy"]["customized_script_header_template_file"] sge_script_header = customized_script_header_template(filename, resources) else: diff --git a/tests/sample_class.py b/tests/sample_class.py index 94a95bbd..9d5985e6 100644 --- a/tests/sample_class.py +++ b/tests/sample_class.py @@ -24,7 +24,7 @@ def get_sample_resources(cls): cpu_per_node=4, gpu_per_node=1, queue_name="T4_4_15", - sge_pe_name= "mpi", + sge_pe_name="mpi", group_size=2, custom_flags=[], strategy={"if_cuda_multi_devices": False}, From c8f565709f00fd8bb48841a9c4dd00ddc07e4858 Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Wed, 27 Mar 2024 14:41:48 +0900 Subject: [PATCH 18/76] y --- dpdispatcher/submission.py | 22 +++++++++------------- tests/sample_class.py | 4 ++-- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/dpdispatcher/submission.py b/dpdispatcher/submission.py index b79204df..cacdcc85 100644 --- a/dpdispatcher/submission.py +++ b/dpdispatcher/submission.py @@ -473,7 +473,7 @@ def generate_jobs(self): random_task_index = list(range(task_num)) random.shuffle(random_task_index) random_task_index_ll = [ - random_task_index[ii : ii + group_size] + random_task_index[ii: ii + group_size] for ii in range(0, task_num, group_size) ] @@ -961,6 +961,8 @@ class Resources: The queue name of batch job scheduler system. group_size : int The number of `tasks` in a `job`. + sge_pe_name : str + The parallel environment name of SGE. custom_flags : list of Str The extra lines pass to job submitting script header strategy : dict @@ -981,8 +983,6 @@ class Resources: The env file to be sourced before the command execution. wait_time : int The waitting time in second after a single task submitted. Default: 0. - sge_pe_name : str - The parallel environment name of SGE. 
""" def __init__( @@ -991,9 +991,9 @@ def __init__( cpu_per_node, gpu_per_node, queue_name, - sge_pe_name, group_size, *, + sge_pe_name="mpi", custom_flags=[], strategy=default_strategy, para_deg=1, @@ -1014,6 +1014,7 @@ def __init__( self.group_size = group_size # self.extra_specification = extra_specification + self.sge_pe_name = sge_pe_name self.custom_flags = custom_flags self.strategy = strategy self.para_deg = para_deg @@ -1025,7 +1026,6 @@ def __init__( self.prepend_script = prepend_script self.append_script = append_script self.wait_time = wait_time - self.sge_pe_name = sge_pe_name # self.if_cuda_multi_devices = if_cuda_multi_devices self.kwargs = kwargs.get("kwargs", kwargs) @@ -1059,9 +1059,9 @@ def serialize(self): resources_dict["cpu_per_node"] = self.cpu_per_node resources_dict["gpu_per_node"] = self.gpu_per_node resources_dict["queue_name"] = self.queue_name - resources_dict["sge_pe_name"] = self.sge_pe_name resources_dict["group_size"] = self.group_size + resources_dict["sge_pe_name"] = self.sge_pe_name resources_dict["custom_flags"] = self.custom_flags resources_dict["strategy"] = self.strategy resources_dict["para_deg"] = self.para_deg @@ -1083,8 +1083,8 @@ def deserialize(cls, resources_dict): cpu_per_node=resources_dict.get("cpu_per_node", 1), gpu_per_node=resources_dict.get("gpu_per_node", 0), queue_name=resources_dict.get("queue_name", ""), - sge_pe_name=resources_dict.get("sge_pe_name", "mpi"), group_size=resources_dict["group_size"], + sge_pe_name=resources_dict.get("sge_pe_name", "mpi"), custom_flags=resources_dict.get("custom_flags", []), strategy=resources_dict.get("strategy", default_strategy), para_deg=resources_dict.get("para_deg", 1), @@ -1133,6 +1133,7 @@ def arginfo(detail_kwargs=True): doc_gpu_per_node = "gpu numbers of each node assigned to each job." doc_queue_name = "The queue name of batch job scheduler system." doc_group_size = "The number of `tasks` in a `job`. 0 means infinity." + doc_sge_pe_name = "The parallel environment name of SGE." doc_custom_flags = "The extra lines pass to job submitting script header" doc_para_deg = "Decide how many tasks will be run in parallel." doc_source_list = "The env file to be sourced before the command execution." 
@@ -1196,13 +1197,8 @@ def arginfo(detail_kwargs=True): "gpu_per_node", int, optional=True, doc=doc_gpu_per_node, default=0 ), Argument("queue_name", str, optional=True, doc=doc_queue_name, default=""), - Argument( - "sge_pe_name", - str, - optional=True, - doc="The name of sge's parallel environment.", - ), Argument("group_size", int, optional=False, doc=doc_group_size), + Argument("sge_pe_name", str, optional=True, doc=doc_sge_pe_name), Argument("custom_flags", List[str], optional=True, doc=doc_custom_flags), # Argument("strategy", dict, optional=True, doc=doc_strategy,default=default_strategy), strategy_format, diff --git a/tests/sample_class.py b/tests/sample_class.py index 94a95bbd..177856d9 100644 --- a/tests/sample_class.py +++ b/tests/sample_class.py @@ -24,8 +24,8 @@ def get_sample_resources(cls): cpu_per_node=4, gpu_per_node=1, queue_name="T4_4_15", - sge_pe_name= "mpi", group_size=2, + sge_pe_name= "mpi", custom_flags=[], strategy={"if_cuda_multi_devices": False}, para_deg=1, @@ -43,8 +43,8 @@ def get_sample_resources_dict(cls): "cpu_per_node": 4, "gpu_per_node": 1, "queue_name": "T4_4_15", - "sge_pe_name": "mpi", "group_size": 2, + "sge_pe_name": "mpi", "custom_flags": [], "strategy": {"if_cuda_multi_devices": False, "ratio_unfinished": 0.0}, "para_deg": 1, From 6d6c9732d25a20d3e99ec3b90952c8a39d4224d1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 27 Mar 2024 05:45:50 +0000 Subject: [PATCH 19/76] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dpdispatcher/submission.py | 2 +- tests/sample_class.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dpdispatcher/submission.py b/dpdispatcher/submission.py index cacdcc85..76849966 100644 --- a/dpdispatcher/submission.py +++ b/dpdispatcher/submission.py @@ -473,7 +473,7 @@ def generate_jobs(self): random_task_index = list(range(task_num)) random.shuffle(random_task_index) random_task_index_ll = [ - random_task_index[ii: ii + group_size] + random_task_index[ii : ii + group_size] for ii in range(0, task_num, group_size) ] diff --git a/tests/sample_class.py b/tests/sample_class.py index 177856d9..5f381ecf 100644 --- a/tests/sample_class.py +++ b/tests/sample_class.py @@ -25,7 +25,7 @@ def get_sample_resources(cls): gpu_per_node=1, queue_name="T4_4_15", group_size=2, - sge_pe_name= "mpi", + sge_pe_name="mpi", custom_flags=[], strategy={"if_cuda_multi_devices": False}, para_deg=1, From dae7909edada874cc4a3c09d73fdf72af557d105 Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Wed, 27 Mar 2024 17:46:12 +0900 Subject: [PATCH 20/76] Update pbs.py --- dpdispatcher/machines/pbs.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/dpdispatcher/machines/pbs.py b/dpdispatcher/machines/pbs.py index 23d9ff76..4e8ecf43 100644 --- a/dpdispatcher/machines/pbs.py +++ b/dpdispatcher/machines/pbs.py @@ -220,13 +220,9 @@ def gen_script_header(self, job): sge_script_header_dict["select_node_line"] += ( f"#$ -q {resources.queue_name}" ) - - if ( - resources["strategy"].get("customized_script_header_template_file") - is not None - ): - filename = resources["strategy"]["customized_script_header_template_file"] - sge_script_header = customized_script_header_template(filename, resources) + if resources["strategy"].get("customized_script_header_template_file") is not None: + file_name = 
resources["strategy"]["customized_script_header_template_file"] + sge_script_header = customized_script_header_template(file_name, resources) else: sge_script_header = sge_script_header_template.format( **sge_script_header_dict From 83644c3494ae2ec6ba7c3187dc8d30bc8f3848c9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 27 Mar 2024 08:51:34 +0000 Subject: [PATCH 21/76] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dpdispatcher/machines/pbs.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dpdispatcher/machines/pbs.py b/dpdispatcher/machines/pbs.py index 4e8ecf43..0e268c55 100644 --- a/dpdispatcher/machines/pbs.py +++ b/dpdispatcher/machines/pbs.py @@ -220,7 +220,10 @@ def gen_script_header(self, job): sge_script_header_dict["select_node_line"] += ( f"#$ -q {resources.queue_name}" ) - if resources["strategy"].get("customized_script_header_template_file") is not None: + if ( + resources["strategy"].get("customized_script_header_template_file") + is not None + ): file_name = resources["strategy"]["customized_script_header_template_file"] sge_script_header = customized_script_header_template(file_name, resources) else: From b1758d9fb6424f68153e093c42ddc7adc27fc7ba Mon Sep 17 00:00:00 2001 From: "C. Thang Nguyen" <46436648+thangckt@users.noreply.github.com> Date: Thu, 4 Apr 2024 13:16:03 +0900 Subject: [PATCH 22/76] Create _version.py --- dpdispatcher/_version.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 dpdispatcher/_version.py diff --git a/dpdispatcher/_version.py b/dpdispatcher/_version.py new file mode 100644 index 00000000..613a45d1 --- /dev/null +++ b/dpdispatcher/_version.py @@ -0,0 +1 @@ +version = 0.6.5_dev From 62a3459dc7913fe018a33927fc705c8150a520f8 Mon Sep 17 00:00:00 2001 From: "C. Thang Nguyen" <46436648+thangckt@users.noreply.github.com> Date: Thu, 4 Apr 2024 13:28:01 +0900 Subject: [PATCH 23/76] Delete dpdispatcher/_version.py --- dpdispatcher/_version.py | 1 - 1 file changed, 1 deletion(-) delete mode 100644 dpdispatcher/_version.py diff --git a/dpdispatcher/_version.py b/dpdispatcher/_version.py deleted file mode 100644 index 613a45d1..00000000 --- a/dpdispatcher/_version.py +++ /dev/null @@ -1 +0,0 @@ -version = 0.6.5_dev From 2ae2e62d474cdcf2817f5af63a348b9b8b678fcb Mon Sep 17 00:00:00 2001 From: "C. 
Thang Nguyen" <46436648+thangckt@users.noreply.github.com> Date: Sun, 14 Apr 2024 02:11:23 +0900 Subject: [PATCH 24/76] Update pbs.py --- dpdispatcher/machines/pbs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpdispatcher/machines/pbs.py b/dpdispatcher/machines/pbs.py index 4e8ecf43..7d6edb10 100644 --- a/dpdispatcher/machines/pbs.py +++ b/dpdispatcher/machines/pbs.py @@ -183,7 +183,7 @@ def gen_script_header(self, job): #!/bin/bash #$ -S /bin/bash #$ -cwd -#$ -N dp_job +#$ -N DPjob {select_node_line} """ From 99589d3d8888098b0dd9420caef661ed88ddcf80 Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Wed, 29 May 2024 10:12:40 +0900 Subject: [PATCH 25/76] u --- dpdispatcher/machines/pbs.py | 32 +++++++++++++++++++++++++++++++- dpdispatcher/submission.py | 8 -------- tests/sample_class.py | 2 -- tests/test_argcheck.py | 1 - 4 files changed, 31 insertions(+), 12 deletions(-) diff --git a/dpdispatcher/machines/pbs.py b/dpdispatcher/machines/pbs.py index 3845ffd8..276e7f3b 100644 --- a/dpdispatcher/machines/pbs.py +++ b/dpdispatcher/machines/pbs.py @@ -1,4 +1,7 @@ import shlex +from typing import List + +from dargs import Argument from dpdispatcher.dlog import dlog from dpdispatcher.machine import Machine @@ -212,9 +215,10 @@ def gen_script_header(self, job): ### Ref:https://softpanorama.org/HPC/PBS_and_derivatives/Reference/pbs_command_vs_sge_commands.shtml # resources.number_node is not used in SGE resources = job.resources + sge_pe_name = resources.kwargs.get("sge_pe_name", "mpi") sge_script_header_dict = {} sge_script_header_dict["select_node_line"] = ( - f"#$ -pe {resources.sge_pe_name} {resources.cpu_per_node}\n" + f"#$ -pe {sge_pe_name} {resources.cpu_per_node}\n" ) if resources.queue_name != "": sge_script_header_dict["select_node_line"] += ( @@ -297,3 +301,29 @@ def check_status(self, job): def check_finish_tag(self, job): job_tag_finished = job.job_hash + "_job_tag_finished" return self.context.check_file_exists(job_tag_finished) + + @classmethod + def resources_subfields(cls) -> List[Argument]: + """Generate the resources subfields. + + sge_pe_name : str + The parallel environment name of SGE. + + Returns + ------- + list[Argument] + resources subfields + """ + doc_sge_pe_name = "The parallel environment name of SGE." + + return [ + Argument( + "kwargs", + dict, + [ + Argument("sge_pe_name", str, optional=True, default="mpi", doc=doc_sge_pe_name), + ], + optional=False, + doc="Extra arguments.", + ) + ] diff --git a/dpdispatcher/submission.py b/dpdispatcher/submission.py index 76849966..9c5c73ca 100644 --- a/dpdispatcher/submission.py +++ b/dpdispatcher/submission.py @@ -961,8 +961,6 @@ class Resources: The queue name of batch job scheduler system. group_size : int The number of `tasks` in a `job`. - sge_pe_name : str - The parallel environment name of SGE. 
custom_flags : list of Str The extra lines pass to job submitting script header strategy : dict @@ -993,7 +991,6 @@ def __init__( queue_name, group_size, *, - sge_pe_name="mpi", custom_flags=[], strategy=default_strategy, para_deg=1, @@ -1014,7 +1011,6 @@ def __init__( self.group_size = group_size # self.extra_specification = extra_specification - self.sge_pe_name = sge_pe_name self.custom_flags = custom_flags self.strategy = strategy self.para_deg = para_deg @@ -1061,7 +1057,6 @@ def serialize(self): resources_dict["queue_name"] = self.queue_name resources_dict["group_size"] = self.group_size - resources_dict["sge_pe_name"] = self.sge_pe_name resources_dict["custom_flags"] = self.custom_flags resources_dict["strategy"] = self.strategy resources_dict["para_deg"] = self.para_deg @@ -1084,7 +1079,6 @@ def deserialize(cls, resources_dict): gpu_per_node=resources_dict.get("gpu_per_node", 0), queue_name=resources_dict.get("queue_name", ""), group_size=resources_dict["group_size"], - sge_pe_name=resources_dict.get("sge_pe_name", "mpi"), custom_flags=resources_dict.get("custom_flags", []), strategy=resources_dict.get("strategy", default_strategy), para_deg=resources_dict.get("para_deg", 1), @@ -1133,7 +1127,6 @@ def arginfo(detail_kwargs=True): doc_gpu_per_node = "gpu numbers of each node assigned to each job." doc_queue_name = "The queue name of batch job scheduler system." doc_group_size = "The number of `tasks` in a `job`. 0 means infinity." - doc_sge_pe_name = "The parallel environment name of SGE." doc_custom_flags = "The extra lines pass to job submitting script header" doc_para_deg = "Decide how many tasks will be run in parallel." doc_source_list = "The env file to be sourced before the command execution." @@ -1198,7 +1191,6 @@ def arginfo(detail_kwargs=True): ), Argument("queue_name", str, optional=True, doc=doc_queue_name, default=""), Argument("group_size", int, optional=False, doc=doc_group_size), - Argument("sge_pe_name", str, optional=True, doc=doc_sge_pe_name), Argument("custom_flags", List[str], optional=True, doc=doc_custom_flags), # Argument("strategy", dict, optional=True, doc=doc_strategy,default=default_strategy), strategy_format, diff --git a/tests/sample_class.py b/tests/sample_class.py index 5f381ecf..7c663094 100644 --- a/tests/sample_class.py +++ b/tests/sample_class.py @@ -25,7 +25,6 @@ def get_sample_resources(cls): gpu_per_node=1, queue_name="T4_4_15", group_size=2, - sge_pe_name="mpi", custom_flags=[], strategy={"if_cuda_multi_devices": False}, para_deg=1, @@ -44,7 +43,6 @@ def get_sample_resources_dict(cls): "gpu_per_node": 1, "queue_name": "T4_4_15", "group_size": 2, - "sge_pe_name": "mpi", "custom_flags": [], "strategy": {"if_cuda_multi_devices": False, "ratio_unfinished": 0.0}, "para_deg": 1, diff --git a/tests/test_argcheck.py b/tests/test_argcheck.py index f007986d..2ef1883e 100644 --- a/tests/test_argcheck.py +++ b/tests/test_argcheck.py @@ -60,7 +60,6 @@ def test_resources_argcheck(self): "para_deg": 1, "prepend_script": [], "queue_name": "haha", - "sge_pe_name": "mpi", "source_list": [], "strategy": {"if_cuda_multi_devices": False, "ratio_unfinished": 0.0}, "wait_time": 0, From 711da1df3908964de2c67cf51f6dd35ffb5e0c9b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 29 May 2024 01:12:51 +0000 Subject: [PATCH 26/76] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dpdispatcher/machines/pbs.py | 10 ++++++++-- 1 file changed, 8 
insertions(+), 2 deletions(-) diff --git a/dpdispatcher/machines/pbs.py b/dpdispatcher/machines/pbs.py index 276e7f3b..adfab6d9 100644 --- a/dpdispatcher/machines/pbs.py +++ b/dpdispatcher/machines/pbs.py @@ -321,8 +321,14 @@ def resources_subfields(cls) -> List[Argument]: "kwargs", dict, [ - Argument("sge_pe_name", str, optional=True, default="mpi", doc=doc_sge_pe_name), - ], + Argument( + "sge_pe_name", + str, + optional=True, + default="mpi", + doc=doc_sge_pe_name, + ), + ], optional=False, doc="Extra arguments.", ) From 1e0c21ebb8dd879bfcd7606eabbe3ba9924b8c63 Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Wed, 29 May 2024 10:25:42 +0900 Subject: [PATCH 27/76] Update test_lsf_script_generation.py --- tests/test_lsf_script_generation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_lsf_script_generation.py b/tests/test_lsf_script_generation.py index e638101b..97be98a0 100755 --- a/tests/test_lsf_script_generation.py +++ b/tests/test_lsf_script_generation.py @@ -108,8 +108,8 @@ def test_shell_trival(self): module load use.own module load deepmd/1.3 - {{ source /data/home/ypliu/scripts/avail_gpu.sh; }} - {{ source /data/home/ypliu/dprun/tf_envs.sh; }} + source /data/home/ypliu/scripts/avail_gpu.sh; + source /data/home/ypliu/dprun/tf_envs.sh; export DP_DISPATCHER_EXPORT=test_foo_bar_baz From e530cf6fce3e0b423568e1c8fe521942ff11c1f4 Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Wed, 29 May 2024 19:43:17 +0900 Subject: [PATCH 28/76] Update test_lsf_script_generation.py --- tests/test_lsf_script_generation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_lsf_script_generation.py b/tests/test_lsf_script_generation.py index 97be98a0..48eb41d9 100755 --- a/tests/test_lsf_script_generation.py +++ b/tests/test_lsf_script_generation.py @@ -108,8 +108,8 @@ def test_shell_trival(self): module load use.own module load deepmd/1.3 - source /data/home/ypliu/scripts/avail_gpu.sh; - source /data/home/ypliu/dprun/tf_envs.sh; + source /data/home/ypliu/scripts/avail_gpu.sh + source /data/home/ypliu/dprun/tf_envs.sh export DP_DISPATCHER_EXPORT=test_foo_bar_baz From 4084a3118d39235140481d153ae900397633a64d Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Wed, 24 Jul 2024 15:49:49 +0900 Subject: [PATCH 29/76] Update pbs.py --- dpdispatcher/machines/pbs.py | 76 ++++++++++-------------------------- 1 file changed, 20 insertions(+), 56 deletions(-) diff --git a/dpdispatcher/machines/pbs.py b/dpdispatcher/machines/pbs.py index adfab6d9..5fe6ee7e 100644 --- a/dpdispatcher/machines/pbs.py +++ b/dpdispatcher/machines/pbs.py @@ -28,22 +28,15 @@ def gen_script_header(self, job): f"#PBS -l select={resources.number_node}:ncpus={resources.cpu_per_node}" ) if resources.gpu_per_node != 0: - pbs_script_header_dict["select_node_line"] += ( - f":ngpus={resources.gpu_per_node}" - ) + pbs_script_header_dict["select_node_line"] += f":ngpus={resources.gpu_per_node}" pbs_script_header_dict["queue_name_line"] = f"#PBS -q {resources.queue_name}" - if ( - resources["strategy"].get("customized_script_header_template_file") - is not None - ): + if resources["strategy"].get("customized_script_header_template_file") is not None: pbs_script_header = customized_script_header_template( resources["strategy"]["customized_script_header_template_file"], resources, ) else: - pbs_script_header = pbs_script_header_template.format( - 
**pbs_script_header_dict - ) + pbs_script_header = pbs_script_header_template.format(**pbs_script_header_dict) return pbs_script_header def do_submit(self, job): @@ -60,9 +53,7 @@ def do_submit(self, job): script_file_dir = self.context.remote_root # stdin, stdout, stderr = self.context.block_checkcall('cd %s && %s %s' % (self.context.remote_root, 'qsub', script_file_name)) stdin, stdout, stderr = self.context.block_checkcall( - "cd {} && {} {}".format( - shlex.quote(script_file_dir), "qsub", shlex.quote(script_file_name) - ) + "cd {} && {} {}".format(shlex.quote(script_file_dir), "qsub", shlex.quote(script_file_name)) ) subret = stdout.readlines() job_id = subret[0].split()[0] @@ -86,8 +77,7 @@ def check_status(self, job): return JobStatus.terminated else: raise RuntimeError( - "status command qstat fails to execute. erro info: %s return code %d" - % (err_str, ret) + "status command qstat fails to execute. erro info: %s return code %d" % (err_str, ret) ) status_line = stdout.read().decode("utf-8").split("\n")[-2] status_word = status_line.split()[-2] @@ -136,8 +126,7 @@ def check_status(self, job): return JobStatus.terminated else: raise RuntimeError( - "status command qstat fails to execute. erro info: %s return code %d" - % (err_str, ret) + "status command qstat fails to execute. erro info: %s return code %d" % (err_str, ret) ) status_line = stdout.read().decode("utf-8").split("\n")[-2] status_word = status_line.split()[-2] @@ -163,22 +152,15 @@ def gen_script_header(self, job): f"#PBS -l nodes={resources.number_node}:ppn={resources.cpu_per_node}" ) if resources.gpu_per_node != 0: - pbs_script_header_dict["select_node_line"] += ( - f":gpus={resources.gpu_per_node}" - ) + pbs_script_header_dict["select_node_line"] += f":gpus={resources.gpu_per_node}" pbs_script_header_dict["queue_name_line"] = f"#PBS -q {resources.queue_name}" - if ( - resources["strategy"].get("customized_script_header_template_file") - is not None - ): + if resources["strategy"].get("customized_script_header_template_file") is not None: pbs_script_header = customized_script_header_template( resources["strategy"]["customized_script_header_template_file"], resources, ) else: - pbs_script_header = pbs_script_header_template.format( - **pbs_script_header_dict - ) + pbs_script_header = pbs_script_header_template.format(**pbs_script_header_dict) return pbs_script_header @@ -186,7 +168,6 @@ def gen_script_header(self, job): #!/bin/bash #$ -S /bin/bash #$ -cwd -#$ -N DPjob {select_node_line} """ @@ -215,25 +196,18 @@ def gen_script_header(self, job): ### Ref:https://softpanorama.org/HPC/PBS_and_derivatives/Reference/pbs_command_vs_sge_commands.shtml # resources.number_node is not used in SGE resources = job.resources + job_name = resources.kwargs.get("job_name", "wDPjob") sge_pe_name = resources.kwargs.get("sge_pe_name", "mpi") sge_script_header_dict = {} - sge_script_header_dict["select_node_line"] = ( - f"#$ -pe {sge_pe_name} {resources.cpu_per_node}\n" - ) + sge_script_header_dict["select_node_line"] = f"#$ -N {job_name}\n" + sge_script_header_dict["select_node_line"] += f"#$ -pe {sge_pe_name} {resources.cpu_per_node}\n" if resources.queue_name != "": - sge_script_header_dict["select_node_line"] += ( - f"#$ -q {resources.queue_name}" - ) - if ( - resources["strategy"].get("customized_script_header_template_file") - is not None - ): + sge_script_header_dict["select_node_line"] += f"#$ -q {resources.queue_name}" + if resources["strategy"].get("customized_script_header_template_file") is not None: file_name = 
resources["strategy"]["customized_script_header_template_file"] sge_script_header = customized_script_header_template(file_name, resources) else: - sge_script_header = sge_script_header_template.format( - **sge_script_header_dict - ) + sge_script_header = sge_script_header_template.format(**sge_script_header_dict) return sge_script_header def do_submit(self, job): @@ -264,10 +238,7 @@ def check_status(self, job): ret, stdin, stdout, stderr = self.context.block_call("qstat") err_str = stderr.read().decode("utf-8") if ret != 0: - raise RuntimeError( - "status command qstat fails to execute. erro info: %s return code %d" - % (err_str, ret) - ) + raise RuntimeError("status command qstat fails to execute. erro info: %s return code %d" % (err_str, ret)) status_text_list = stdout.read().decode("utf-8").split("\n") for txt in status_text_list: if job_id in txt: @@ -278,10 +249,7 @@ def check_status(self, job): while count <= 6: if self.check_finish_tag(job=job): return JobStatus.finished - dlog.info( - "not tag_finished detected, execute sync command and wait. count " - + str(count) - ) + dlog.info("not tag_finished detected, execute sync command and wait. count " + str(count)) self.context.block_call("sync") import time @@ -315,19 +283,15 @@ def resources_subfields(cls) -> List[Argument]: resources subfields """ doc_sge_pe_name = "The parallel environment name of SGE." + doc_job_name = "The name of SGE's job." return [ Argument( "kwargs", dict, [ - Argument( - "sge_pe_name", - str, - optional=True, - default="mpi", - doc=doc_sge_pe_name, - ), + Argument("sge_pe_name", str, optional=True, default="mpi", doc=doc_sge_pe_name), + Argument("job_name", str, optional=True, default="wDPjob", doc=doc_job_name), ], optional=False, doc="Extra arguments.", From 9b1c566fb6e276092fc72dd400e822c2d5645571 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 24 Jul 2024 06:54:44 +0000 Subject: [PATCH 30/76] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dpdispatcher/machines/pbs.py | 79 ++++++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 17 deletions(-) diff --git a/dpdispatcher/machines/pbs.py b/dpdispatcher/machines/pbs.py index 5fe6ee7e..034c76d2 100644 --- a/dpdispatcher/machines/pbs.py +++ b/dpdispatcher/machines/pbs.py @@ -28,15 +28,22 @@ def gen_script_header(self, job): f"#PBS -l select={resources.number_node}:ncpus={resources.cpu_per_node}" ) if resources.gpu_per_node != 0: - pbs_script_header_dict["select_node_line"] += f":ngpus={resources.gpu_per_node}" + pbs_script_header_dict["select_node_line"] += ( + f":ngpus={resources.gpu_per_node}" + ) pbs_script_header_dict["queue_name_line"] = f"#PBS -q {resources.queue_name}" - if resources["strategy"].get("customized_script_header_template_file") is not None: + if ( + resources["strategy"].get("customized_script_header_template_file") + is not None + ): pbs_script_header = customized_script_header_template( resources["strategy"]["customized_script_header_template_file"], resources, ) else: - pbs_script_header = pbs_script_header_template.format(**pbs_script_header_dict) + pbs_script_header = pbs_script_header_template.format( + **pbs_script_header_dict + ) return pbs_script_header def do_submit(self, job): @@ -53,7 +60,9 @@ def do_submit(self, job): script_file_dir = self.context.remote_root # stdin, stdout, stderr = self.context.block_checkcall('cd %s && %s %s' % (self.context.remote_root, 'qsub', script_file_name)) 
stdin, stdout, stderr = self.context.block_checkcall( - "cd {} && {} {}".format(shlex.quote(script_file_dir), "qsub", shlex.quote(script_file_name)) + "cd {} && {} {}".format( + shlex.quote(script_file_dir), "qsub", shlex.quote(script_file_name) + ) ) subret = stdout.readlines() job_id = subret[0].split()[0] @@ -77,7 +86,8 @@ def check_status(self, job): return JobStatus.terminated else: raise RuntimeError( - "status command qstat fails to execute. erro info: %s return code %d" % (err_str, ret) + "status command qstat fails to execute. erro info: %s return code %d" + % (err_str, ret) ) status_line = stdout.read().decode("utf-8").split("\n")[-2] status_word = status_line.split()[-2] @@ -126,7 +136,8 @@ def check_status(self, job): return JobStatus.terminated else: raise RuntimeError( - "status command qstat fails to execute. erro info: %s return code %d" % (err_str, ret) + "status command qstat fails to execute. erro info: %s return code %d" + % (err_str, ret) ) status_line = stdout.read().decode("utf-8").split("\n")[-2] status_word = status_line.split()[-2] @@ -152,15 +163,22 @@ def gen_script_header(self, job): f"#PBS -l nodes={resources.number_node}:ppn={resources.cpu_per_node}" ) if resources.gpu_per_node != 0: - pbs_script_header_dict["select_node_line"] += f":gpus={resources.gpu_per_node}" + pbs_script_header_dict["select_node_line"] += ( + f":gpus={resources.gpu_per_node}" + ) pbs_script_header_dict["queue_name_line"] = f"#PBS -q {resources.queue_name}" - if resources["strategy"].get("customized_script_header_template_file") is not None: + if ( + resources["strategy"].get("customized_script_header_template_file") + is not None + ): pbs_script_header = customized_script_header_template( resources["strategy"]["customized_script_header_template_file"], resources, ) else: - pbs_script_header = pbs_script_header_template.format(**pbs_script_header_dict) + pbs_script_header = pbs_script_header_template.format( + **pbs_script_header_dict + ) return pbs_script_header @@ -200,14 +218,23 @@ def gen_script_header(self, job): sge_pe_name = resources.kwargs.get("sge_pe_name", "mpi") sge_script_header_dict = {} sge_script_header_dict["select_node_line"] = f"#$ -N {job_name}\n" - sge_script_header_dict["select_node_line"] += f"#$ -pe {sge_pe_name} {resources.cpu_per_node}\n" + sge_script_header_dict["select_node_line"] += ( + f"#$ -pe {sge_pe_name} {resources.cpu_per_node}\n" + ) if resources.queue_name != "": - sge_script_header_dict["select_node_line"] += f"#$ -q {resources.queue_name}" - if resources["strategy"].get("customized_script_header_template_file") is not None: + sge_script_header_dict["select_node_line"] += ( + f"#$ -q {resources.queue_name}" + ) + if ( + resources["strategy"].get("customized_script_header_template_file") + is not None + ): file_name = resources["strategy"]["customized_script_header_template_file"] sge_script_header = customized_script_header_template(file_name, resources) else: - sge_script_header = sge_script_header_template.format(**sge_script_header_dict) + sge_script_header = sge_script_header_template.format( + **sge_script_header_dict + ) return sge_script_header def do_submit(self, job): @@ -238,7 +265,10 @@ def check_status(self, job): ret, stdin, stdout, stderr = self.context.block_call("qstat") err_str = stderr.read().decode("utf-8") if ret != 0: - raise RuntimeError("status command qstat fails to execute. erro info: %s return code %d" % (err_str, ret)) + raise RuntimeError( + "status command qstat fails to execute. 
erro info: %s return code %d" + % (err_str, ret) + ) status_text_list = stdout.read().decode("utf-8").split("\n") for txt in status_text_list: if job_id in txt: @@ -249,7 +279,10 @@ def check_status(self, job): while count <= 6: if self.check_finish_tag(job=job): return JobStatus.finished - dlog.info("not tag_finished detected, execute sync command and wait. count " + str(count)) + dlog.info( + "not tag_finished detected, execute sync command and wait. count " + + str(count) + ) self.context.block_call("sync") import time @@ -290,8 +323,20 @@ def resources_subfields(cls) -> List[Argument]: "kwargs", dict, [ - Argument("sge_pe_name", str, optional=True, default="mpi", doc=doc_sge_pe_name), - Argument("job_name", str, optional=True, default="wDPjob", doc=doc_job_name), + Argument( + "sge_pe_name", + str, + optional=True, + default="mpi", + doc=doc_sge_pe_name, + ), + Argument( + "job_name", + str, + optional=True, + default="wDPjob", + doc=doc_job_name, + ), ], optional=False, doc="Extra arguments.", From fc690aeb79723d9f41f84289bda1a41bf52a3874 Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Thu, 25 Jul 2024 09:19:27 +0900 Subject: [PATCH 31/76] change `sge_pe_name` to `pe_name` --- dpdispatcher/machines/pbs.py | 73 ++++++++++-------------------------- 1 file changed, 20 insertions(+), 53 deletions(-) diff --git a/dpdispatcher/machines/pbs.py b/dpdispatcher/machines/pbs.py index 034c76d2..8f4c1e5e 100644 --- a/dpdispatcher/machines/pbs.py +++ b/dpdispatcher/machines/pbs.py @@ -28,22 +28,15 @@ def gen_script_header(self, job): f"#PBS -l select={resources.number_node}:ncpus={resources.cpu_per_node}" ) if resources.gpu_per_node != 0: - pbs_script_header_dict["select_node_line"] += ( - f":ngpus={resources.gpu_per_node}" - ) + pbs_script_header_dict["select_node_line"] += f":ngpus={resources.gpu_per_node}" pbs_script_header_dict["queue_name_line"] = f"#PBS -q {resources.queue_name}" - if ( - resources["strategy"].get("customized_script_header_template_file") - is not None - ): + if resources["strategy"].get("customized_script_header_template_file") is not None: pbs_script_header = customized_script_header_template( resources["strategy"]["customized_script_header_template_file"], resources, ) else: - pbs_script_header = pbs_script_header_template.format( - **pbs_script_header_dict - ) + pbs_script_header = pbs_script_header_template.format(**pbs_script_header_dict) return pbs_script_header def do_submit(self, job): @@ -60,9 +53,7 @@ def do_submit(self, job): script_file_dir = self.context.remote_root # stdin, stdout, stderr = self.context.block_checkcall('cd %s && %s %s' % (self.context.remote_root, 'qsub', script_file_name)) stdin, stdout, stderr = self.context.block_checkcall( - "cd {} && {} {}".format( - shlex.quote(script_file_dir), "qsub", shlex.quote(script_file_name) - ) + "cd {} && {} {}".format(shlex.quote(script_file_dir), "qsub", shlex.quote(script_file_name)) ) subret = stdout.readlines() job_id = subret[0].split()[0] @@ -86,8 +77,7 @@ def check_status(self, job): return JobStatus.terminated else: raise RuntimeError( - "status command qstat fails to execute. erro info: %s return code %d" - % (err_str, ret) + "status command qstat fails to execute. erro info: %s return code %d" % (err_str, ret) ) status_line = stdout.read().decode("utf-8").split("\n")[-2] status_word = status_line.split()[-2] @@ -136,8 +126,7 @@ def check_status(self, job): return JobStatus.terminated else: raise RuntimeError( - "status command qstat fails to execute. 
erro info: %s return code %d" - % (err_str, ret) + "status command qstat fails to execute. erro info: %s return code %d" % (err_str, ret) ) status_line = stdout.read().decode("utf-8").split("\n")[-2] status_word = status_line.split()[-2] @@ -163,22 +152,15 @@ def gen_script_header(self, job): f"#PBS -l nodes={resources.number_node}:ppn={resources.cpu_per_node}" ) if resources.gpu_per_node != 0: - pbs_script_header_dict["select_node_line"] += ( - f":gpus={resources.gpu_per_node}" - ) + pbs_script_header_dict["select_node_line"] += f":gpus={resources.gpu_per_node}" pbs_script_header_dict["queue_name_line"] = f"#PBS -q {resources.queue_name}" - if ( - resources["strategy"].get("customized_script_header_template_file") - is not None - ): + if resources["strategy"].get("customized_script_header_template_file") is not None: pbs_script_header = customized_script_header_template( resources["strategy"]["customized_script_header_template_file"], resources, ) else: - pbs_script_header = pbs_script_header_template.format( - **pbs_script_header_dict - ) + pbs_script_header = pbs_script_header_template.format(**pbs_script_header_dict) return pbs_script_header @@ -215,26 +197,17 @@ def gen_script_header(self, job): # resources.number_node is not used in SGE resources = job.resources job_name = resources.kwargs.get("job_name", "wDPjob") - sge_pe_name = resources.kwargs.get("sge_pe_name", "mpi") + pe_name = resources.kwargs.get("pe_name", "mpi") sge_script_header_dict = {} sge_script_header_dict["select_node_line"] = f"#$ -N {job_name}\n" - sge_script_header_dict["select_node_line"] += ( - f"#$ -pe {sge_pe_name} {resources.cpu_per_node}\n" - ) + sge_script_header_dict["select_node_line"] += f"#$ -pe {pe_name} {resources.cpu_per_node}\n" if resources.queue_name != "": - sge_script_header_dict["select_node_line"] += ( - f"#$ -q {resources.queue_name}" - ) - if ( - resources["strategy"].get("customized_script_header_template_file") - is not None - ): + sge_script_header_dict["select_node_line"] += f"#$ -q {resources.queue_name}" + if resources["strategy"].get("customized_script_header_template_file") is not None: file_name = resources["strategy"]["customized_script_header_template_file"] sge_script_header = customized_script_header_template(file_name, resources) else: - sge_script_header = sge_script_header_template.format( - **sge_script_header_dict - ) + sge_script_header = sge_script_header_template.format(**sge_script_header_dict) return sge_script_header def do_submit(self, job): @@ -265,10 +238,7 @@ def check_status(self, job): ret, stdin, stdout, stderr = self.context.block_call("qstat") err_str = stderr.read().decode("utf-8") if ret != 0: - raise RuntimeError( - "status command qstat fails to execute. erro info: %s return code %d" - % (err_str, ret) - ) + raise RuntimeError(f"status command qstat fails to execute. erro info: {err_str} return code {ret}") status_text_list = stdout.read().decode("utf-8").split("\n") for txt in status_text_list: if job_id in txt: @@ -279,10 +249,7 @@ def check_status(self, job): while count <= 6: if self.check_finish_tag(job=job): return JobStatus.finished - dlog.info( - "not tag_finished detected, execute sync command and wait. count " - + str(count) - ) + dlog.info(f"not tag_finished detected, execute sync command and wait. count {count}") self.context.block_call("sync") import time @@ -307,7 +274,7 @@ def check_finish_tag(self, job): def resources_subfields(cls) -> List[Argument]: """Generate the resources subfields. 
- sge_pe_name : str + pe_name : str The parallel environment name of SGE. Returns @@ -315,7 +282,7 @@ def resources_subfields(cls) -> List[Argument]: list[Argument] resources subfields """ - doc_sge_pe_name = "The parallel environment name of SGE." + doc_pe_name = "The parallel environment name of SGE system." doc_job_name = "The name of SGE's job." return [ @@ -324,11 +291,11 @@ def resources_subfields(cls) -> List[Argument]: dict, [ Argument( - "sge_pe_name", + "pe_name", str, optional=True, default="mpi", - doc=doc_sge_pe_name, + doc=doc_pe_name, ), Argument( "job_name", From cfbc5ddfe79670165d69c893918d576d708fd891 Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Thu, 25 Jul 2024 15:28:13 +0900 Subject: [PATCH 32/76] Update ssh_context.py --- dpdispatcher/contexts/ssh_context.py | 193 +++++++++------------------ 1 file changed, 60 insertions(+), 133 deletions(-) diff --git a/dpdispatcher/contexts/ssh_context.py b/dpdispatcher/contexts/ssh_context.py index 959b6445..39247c63 100644 --- a/dpdispatcher/contexts/ssh_context.py +++ b/dpdispatcher/contexts/ssh_context.py @@ -87,10 +87,7 @@ def ensure_alive(self, max_check=10, sleep_time=10): count = 1 while not self._check_alive(): if count == max_check: - raise RuntimeError( - "cannot connect ssh after %d failures at interval %d s" - % (max_check, sleep_time) - ) + raise RuntimeError("cannot connect ssh after %d failures at interval %d s" % (max_check, sleep_time)) dlog.info("connection check failed, try to reconnect to " + self.hostname) self._setup_ssh() count += 1 @@ -168,9 +165,7 @@ def _setup_ssh(self): ): try: # passing empty passphrase would not raise error. - key = pkey_class.from_private_key_file( - key_path, self.passphrase - ) + key = pkey_class.from_private_key_file(key_path, self.passphrase) except paramiko.SSHException as e: pass if key is not None: @@ -185,9 +180,7 @@ def _setup_ssh(self): (paramiko.Ed25519Key, "ed25519"), ]: for directory in [".ssh", "ssh"]: - full_path = os.path.join( - os.path.expanduser("~"), directory, f"id_{name}" - ) + full_path = os.path.join(os.path.expanduser("~"), directory, f"id_{name}") if os.path.isfile(full_path): keyfiles.append((keytype, full_path)) # TODO: supporting cert @@ -220,9 +213,7 @@ def _setup_ssh(self): elif self.password is not None: ts.auth_password(self.username, self.password) elif key_error is not None: - raise RuntimeError( - "Authentication failed, try to provide password" - ) from key_error + raise RuntimeError("Authentication failed, try to provide password") from key_error else: raise RuntimeError("Please provide at least one form of authentication") assert ts.is_active() @@ -269,11 +260,7 @@ def inter_handler(self, title, instructions, prompt_list): resp.append(self.username) elif "password" in pr_str: resp.append(self.password) - elif ( - "verification" in pr_str - or "token" in pr_str - and self.totp_secret is not None - ): + elif "verification" in pr_str or "token" in pr_str and self.totp_secret is not None: assert self.totp_secret is not None resp.append(generate_totp(self.totp_secret)) @@ -321,19 +308,17 @@ def arginfo(): ) doc_port = "ssh connection port." doc_key_filename = ( - "key filename used by ssh connection. If left None, find key in ~/.ssh or " - "use password for login" + "key filename used by ssh connection. 
If left None, find key in ~/.ssh or " "use password for login" ) doc_passphrase = "passphrase of key used by ssh connection" doc_timeout = "timeout of ssh connection" doc_totp_secret = ( - "Time-based one time password secret. It should be a base32-encoded string" - " extracted from the 2D code." + "Time-based one time password secret. It should be a base32-encoded string" " extracted from the 2D code." ) - doc_tar_compress = "The archive will be compressed in upload and download if it is True. If not, compression will be skipped." - doc_look_for_keys = ( - "enable searching for discoverable private key files in ~/.ssh/" + doc_tar_compress = ( + "The archive will be compressed in upload and download if it is True. If not, compression will be skipped." ) + doc_look_for_keys = "enable searching for discoverable private key files in ~/.ssh/" ssh_remote_profile_args = [ Argument("hostname", str, optional=False, doc=doc_hostname), Argument("username", str, optional=False, doc=doc_username), @@ -354,12 +339,8 @@ def arginfo(): doc=doc_passphrase, ), Argument("timeout", int, optional=True, default=10, doc=doc_timeout), - Argument( - "totp_secret", str, optional=True, default=None, doc=doc_totp_secret - ), - Argument( - "tar_compress", bool, optional=True, default=True, doc=doc_tar_compress - ), + Argument("totp_secret", str, optional=True, default=None, doc=doc_totp_secret), + Argument("tar_compress", bool, optional=True, default=True, doc=doc_tar_compress), Argument( "look_for_keys", bool, @@ -368,9 +349,7 @@ def arginfo(): doc=doc_look_for_keys, ), ] - ssh_remote_profile_format = Argument( - "ssh_session", dict, ssh_remote_profile_args - ) + ssh_remote_profile_format = Argument("ssh_session", dict, ssh_remote_profile_args) return ssh_remote_profile_format def put(self, from_f, to_f): @@ -498,9 +477,7 @@ def bind_submission(self, submission): assert self.ssh_session is not None assert self.ssh_session.ssh is not None self.submission = submission - self.local_root = pathlib.PurePath( - os.path.join(self.temp_local_root, submission.work_base) - ).as_posix() + self.local_root = pathlib.PurePath(os.path.join(self.temp_local_root, submission.work_base)).as_posix() old_remote_root = self.remote_root # self.remote_root = os.path.join(self.temp_remote_root, self.submission.submission_hash, self.submission.work_base ) self.remote_root = pathlib.PurePath( @@ -513,9 +490,7 @@ def bind_submission(self, submission): and self.check_file_exists(old_remote_root) and not self.check_file_exists(self.remote_root) ): - self.block_checkcall( - f"mv {shlex.quote(old_remote_root)} {shlex.quote(self.remote_root)}" - ) + self.block_checkcall(f"mv {shlex.quote(old_remote_root)} {shlex.quote(self.remote_root)}") elif ( old_remote_root is not None and old_remote_root != self.remote_root @@ -550,9 +525,7 @@ def _walk_directory(self, files, work_path, file_list, directory_list): if os.path.isfile(file_name): file_list.append(file_name) elif os.path.isdir(file_name): - for root, dirs, files in os.walk( - file_name, topdown=False, followlinks=True - ): + for root, dirs, files in os.walk(file_name, topdown=False, followlinks=True): if not files: directory_list.append(root) for name in files: @@ -562,12 +535,8 @@ def _walk_directory(self, files, work_path, file_list, directory_list): elif glob(file_name): # If the file name contains a wildcard, os.path functions will fail to identify it. Use glob to get the complete list of filenames which match the wildcard. 
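As a side note, a self-contained sketch of what this glob branch achieves; the task layout is invented for illustration:

    import os
    from glob import glob

    work_path = "."  # hypothetical directory holding task_000/, task_001/, ...
    abs_file_list = glob("task_*/POSCAR")  # expands the shell-style wildcard
    rel_file_list = [os.path.relpath(ii, start=work_path) for ii in abs_file_list]
    # os.path.isfile("task_*/POSCAR") is simply False; only glob() understands
    # the pattern, so the expanded names are then walked file by file.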
abs_file_list = glob(file_name) - rel_file_list = [ - os.path.relpath(ii, start=work_path) for ii in abs_file_list - ] - self._walk_directory( - rel_file_list, work_path, file_list, directory_list - ) + rel_file_list = [os.path.relpath(ii, start=work_path) for ii in abs_file_list] + self._walk_directory(rel_file_list, work_path, file_list, directory_list) else: raise FileNotFoundError(f"cannot find upload file {work_path} {jj}") @@ -602,9 +571,7 @@ def upload( file_list, directory_list, ) - self._walk_directory( - submission.forward_common_files, self.local_root, file_list, directory_list - ) + self._walk_directory(submission.forward_common_files, self.local_root, file_list, directory_list) # convert to relative path to local_root directory_list = [os.path.relpath(jj, self.local_root) for jj in directory_list] @@ -616,14 +583,10 @@ def upload( sha256_list = [] for jj in file_list: sha256 = get_sha256(jj) - jj_rel = pathlib.PurePath( - os.path.relpath(jj, self.local_root) - ).as_posix() + jj_rel = pathlib.PurePath(os.path.relpath(jj, self.local_root)).as_posix() sha256_list.append(f"{sha256} {jj_rel}") # write to remote - sha256_file = os.path.join( - self.remote_root, ".tmp.sha256." + str(uuid.uuid4()) - ) + sha256_file = os.path.join(self.remote_root, ".tmp.sha256." + str(uuid.uuid4())) self.write_file(sha256_file, "\n".join(sha256_list)) # check sha256 # `:` means pass: https://stackoverflow.com/a/2421592/9567349 @@ -650,9 +613,7 @@ def upload( def list_remote_dir(self, sftp, remote_dir, ref_remote_root, result_list): for entry in sftp.listdir_attr(remote_dir): - remote_name = pathlib.PurePath( - os.path.join(remote_dir, entry.filename) - ).as_posix() + remote_name = pathlib.PurePath(os.path.join(remote_dir, entry.filename)).as_posix() st_mode = entry.st_mode if S_ISDIR(st_mode): self.list_remote_dir(sftp, remote_name, ref_remote_root, result_list) @@ -681,23 +642,16 @@ def download( abs_file_list = fnmatch.filter(remote_file_list, jj) else: remote_file_list = [] - remote_job = pathlib.PurePath( - os.path.join(self.remote_root, ii.task_work_path) - ).as_posix() - self.list_remote_dir( - self.sftp, remote_job, remote_job, remote_file_list - ) + remote_job = pathlib.PurePath(os.path.join(self.remote_root, ii.task_work_path)).as_posix() + self.list_remote_dir(self.sftp, remote_job, remote_job, remote_file_list) abs_file_list = fnmatch.filter(remote_file_list, jj) rel_file_list = [ - pathlib.PurePath(os.path.join(ii.task_work_path, kk)).as_posix() - for kk in abs_file_list + pathlib.PurePath(os.path.join(ii.task_work_path, kk)).as_posix() for kk in abs_file_list ] else: - rel_file_list = [ - pathlib.PurePath(os.path.join(ii.task_work_path, jj)).as_posix() - ] + rel_file_list = [pathlib.PurePath(os.path.join(ii.task_work_path, jj)).as_posix()] if check_exists: for file_name in rel_file_list: if self.check_file_exists(file_name): @@ -721,23 +675,14 @@ def download( abs_errors = fnmatch.filter(remote_file_list, "error*") else: remote_file_list = [] - remote_job = pathlib.PurePath( - os.path.join(self.remote_root, ii.task_work_path) - ).as_posix() - self.list_remote_dir( - self.sftp, remote_job, remote_job, remote_file_list - ) + remote_job = pathlib.PurePath(os.path.join(self.remote_root, ii.task_work_path)).as_posix() + self.list_remote_dir(self.sftp, remote_job, remote_job, remote_file_list) abs_errors = fnmatch.filter(remote_file_list, "error*") - rel_errors = [ - pathlib.PurePath(os.path.join(ii.task_work_path, kk)).as_posix() - for kk in abs_errors - ] + rel_errors = 
[pathlib.PurePath(os.path.join(ii.task_work_path, kk)).as_posix() for kk in abs_errors] file_list.extend(rel_errors) file_list.extend(submission.backward_common_files) if len(file_list) > 0: - self._get_files( - file_list, tar_compress=self.remote_profile.get("tar_compress", None) - ) + self._get_files(file_list, tar_compress=self.remote_profile.get("tar_compress", None)) def block_checkcall(self, cmd, asynchronously=False, stderr_whitelist=None): """Run command with arguments. Wait for command to complete. If the return code @@ -758,9 +703,7 @@ def block_checkcall(self, cmd, asynchronously=False, stderr_whitelist=None): self.ssh_session.ensure_alive() if asynchronously: cmd = f"nohup {cmd} >/dev/null &" - stdin, stdout, stderr = self.ssh_session.exec_command( - (f"cd {shlex.quote(self.remote_root)} ;") + cmd - ) + stdin, stdout, stderr = self.ssh_session.exec_command((f"cd {shlex.quote(self.remote_root)} ;") + cmd) exit_status = stdout.channel.recv_exit_status() if exit_status != 0: raise RuntimeError( @@ -777,9 +720,7 @@ def block_checkcall(self, cmd, asynchronously=False, stderr_whitelist=None): def block_call(self, cmd): assert self.remote_root is not None self.ssh_session.ensure_alive() - stdin, stdout, stderr = self.ssh_session.exec_command( - (f"cd {shlex.quote(self.remote_root)} ;") + cmd - ) + stdin, stdout, stderr = self.ssh_session.exec_command((f"cd {shlex.quote(self.remote_root)} ;") + cmd) exit_status = stdout.channel.recv_exit_status() return exit_status, stdin, stdout, stderr @@ -793,19 +734,21 @@ def write_file(self, fname, write_str): fname = pathlib.PurePath(os.path.join(self.remote_root, fname)).as_posix() # to prevent old file from being overwritten but cancelled, create a temporary file first # when it is fully written, rename it to the original file name - with self.sftp.open(fname + "~", "w") as fp: - fp.write(write_str) + temp_fname = fname + "_tmp" + try: + with self.sftp.open(temp_fname, "w") as fp: + fp.write(write_str) + # Rename the temporary file + self.block_checkcall(f"mv {shlex.quote(temp_fname)} {shlex.quote(fname)}") # sftp.rename may throw OSError - self.block_checkcall( - "mv {} {}".format(shlex.quote(fname + "~"), shlex.quote(fname)) - ) + except OSError as e: + print(f"Error writing to file {fname}") + raise e def read_file(self, fname): assert self.remote_root is not None self.ssh_session.ensure_alive() - with self.sftp.open( - pathlib.PurePath(os.path.join(self.remote_root, fname)).as_posix(), "r" - ) as fp: + with self.sftp.open(pathlib.PurePath(os.path.join(self.remote_root, fname)).as_posix(), "r") as fp: ret = fp.read().decode("utf-8") return ret @@ -813,9 +756,7 @@ def check_file_exists(self, fname): assert self.remote_root is not None self.ssh_session.ensure_alive() try: - self.sftp.stat( - pathlib.PurePath(os.path.join(self.remote_root, fname)).as_posix() - ) + self.sftp.stat(pathlib.PurePath(os.path.join(self.remote_root, fname)).as_posix()) ret = True except OSError: ret = False @@ -945,36 +886,24 @@ def _get_files(self, files, tar_compress=True): per_nfile = 100 ntar = len(files) // per_nfile + 1 if ntar <= 1: - try: - self.block_checkcall( - "tar {} {} {}".format( - tar_command, - shlex.quote(of), - " ".join([shlex.quote(file) for file in files]), - ) - ) - except RuntimeError as e: - if "No such file or directory" in str(e): - raise FileNotFoundError( - "Any of the backward files does not exist in the remote directory." 
- ) from e - raise e + file_list = " ".join([shlex.quote(file) for file in files]) + tar_cmd = f"tar {tar_command} {shlex.quote(of)} {file_list}" else: - file_list_file = os.path.join( - self.remote_root, ".tmp.tar." + str(uuid.uuid4()) - ) + file_list_file = pathlib.PurePath(os.path.join(self.remote_root, f"tmp_tar_{uuid.uuid4()}")).as_posix() self.write_file(file_list_file, "\n".join(files)) - try: - self.block_checkcall( - f"tar {tar_command} {shlex.quote(of)} -T {shlex.quote(file_list_file)}" - ) - except RuntimeError as e: - if "No such file or directory" in str(e): - raise FileNotFoundError( - "Any of the backward files does not exist in the remote directory." - ) from e - raise e - # trans + # if not os.path.exists(file_list_file): + # raise FileNotFoundError(f"File list was not created at {file_list_file}") + tar_cmd = f"tar {tar_command} {shlex.quote(of)} -T {shlex.quote(file_list_file)}" + + # Execute the tar command remotely + try: + self.block_checkcall(tar_cmd) + except RuntimeError as e: + if "No such file or directory" in str(e): + raise FileNotFoundError("Backward files do not exist in the remote directory.") from e + raise e + + # Transfer the archive from remote to local from_f = pathlib.PurePath(os.path.join(self.remote_root, of)).as_posix() to_f = pathlib.PurePath(os.path.join(self.local_root, of)).as_posix() if os.path.isfile(to_f): @@ -996,9 +925,7 @@ def machine_subfields(cls) -> List[Argument]: list[Argument] machine subfields """ - doc_remote_profile = ( - "The information used to maintain the connection with remote machine." - ) + doc_remote_profile = "The information used to maintain the connection with remote machine." remote_profile_format = SSHSession.arginfo() remote_profile_format.name = "remote_profile" remote_profile_format.doc = doc_remote_profile From 7bbc4857844d854a6ad7a86fef23a0b3679b61fb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 25 Jul 2024 06:40:57 +0000 Subject: [PATCH 33/76] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dpdispatcher/contexts/ssh_context.py | 147 ++++++++++++++++++++------- dpdispatcher/machines/pbs.py | 62 ++++++++--- 2 files changed, 155 insertions(+), 54 deletions(-) diff --git a/dpdispatcher/contexts/ssh_context.py b/dpdispatcher/contexts/ssh_context.py index 39247c63..4390c0d6 100644 --- a/dpdispatcher/contexts/ssh_context.py +++ b/dpdispatcher/contexts/ssh_context.py @@ -87,7 +87,10 @@ def ensure_alive(self, max_check=10, sleep_time=10): count = 1 while not self._check_alive(): if count == max_check: - raise RuntimeError("cannot connect ssh after %d failures at interval %d s" % (max_check, sleep_time)) + raise RuntimeError( + "cannot connect ssh after %d failures at interval %d s" + % (max_check, sleep_time) + ) dlog.info("connection check failed, try to reconnect to " + self.hostname) self._setup_ssh() count += 1 @@ -165,7 +168,9 @@ def _setup_ssh(self): ): try: # passing empty passphrase would not raise error. 
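A hedged sketch of the key-probing loop this hunk reformats: each paramiko key class is tried in turn until one accepts the file; the key path is hypothetical:

    import os
    import paramiko

    key_path = os.path.expanduser("~/.ssh/id_rsa")  # hypothetical key file
    key = None
    for pkey_class in (paramiko.RSAKey, paramiko.DSSKey,
                       paramiko.ECDSAKey, paramiko.Ed25519Key):
        try:
            key = pkey_class.from_private_key_file(key_path, None)
            break  # parsed successfully, stop probing
        except paramiko.SSHException:
            continue  # wrong key type (or bad passphrase), try the next class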
- key = pkey_class.from_private_key_file(key_path, self.passphrase) + key = pkey_class.from_private_key_file( + key_path, self.passphrase + ) except paramiko.SSHException as e: pass if key is not None: @@ -180,7 +185,9 @@ def _setup_ssh(self): (paramiko.Ed25519Key, "ed25519"), ]: for directory in [".ssh", "ssh"]: - full_path = os.path.join(os.path.expanduser("~"), directory, f"id_{name}") + full_path = os.path.join( + os.path.expanduser("~"), directory, f"id_{name}" + ) if os.path.isfile(full_path): keyfiles.append((keytype, full_path)) # TODO: supporting cert @@ -213,7 +220,9 @@ def _setup_ssh(self): elif self.password is not None: ts.auth_password(self.username, self.password) elif key_error is not None: - raise RuntimeError("Authentication failed, try to provide password") from key_error + raise RuntimeError( + "Authentication failed, try to provide password" + ) from key_error else: raise RuntimeError("Please provide at least one form of authentication") assert ts.is_active() @@ -260,7 +269,11 @@ def inter_handler(self, title, instructions, prompt_list): resp.append(self.username) elif "password" in pr_str: resp.append(self.password) - elif "verification" in pr_str or "token" in pr_str and self.totp_secret is not None: + elif ( + "verification" in pr_str + or "token" in pr_str + and self.totp_secret is not None + ): assert self.totp_secret is not None resp.append(generate_totp(self.totp_secret)) @@ -308,17 +321,19 @@ def arginfo(): ) doc_port = "ssh connection port." doc_key_filename = ( - "key filename used by ssh connection. If left None, find key in ~/.ssh or " "use password for login" + "key filename used by ssh connection. If left None, find key in ~/.ssh or " + "use password for login" ) doc_passphrase = "passphrase of key used by ssh connection" doc_timeout = "timeout of ssh connection" doc_totp_secret = ( - "Time-based one time password secret. It should be a base32-encoded string" " extracted from the 2D code." + "Time-based one time password secret. It should be a base32-encoded string" + " extracted from the 2D code." ) - doc_tar_compress = ( - "The archive will be compressed in upload and download if it is True. If not, compression will be skipped." + doc_tar_compress = "The archive will be compressed in upload and download if it is True. If not, compression will be skipped." 
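Taken together, the arguments documented in this arginfo() block describe a remote_profile such as the following; every value is a made-up example rather than a recommendation:

    remote_profile = {
        "hostname": "hpc.example.org",  # hypothetical login node
        "username": "alice",
        "port": 22,
        "timeout": 10,
        "key_filename": None,           # fall back to keys in ~/.ssh, or password
        "tar_compress": True,           # gzip the upload/download archives
        "look_for_keys": True,
    }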
+ doc_look_for_keys = ( + "enable searching for discoverable private key files in ~/.ssh/" ) - doc_look_for_keys = "enable searching for discoverable private key files in ~/.ssh/" ssh_remote_profile_args = [ Argument("hostname", str, optional=False, doc=doc_hostname), Argument("username", str, optional=False, doc=doc_username), @@ -339,8 +354,12 @@ def arginfo(): doc=doc_passphrase, ), Argument("timeout", int, optional=True, default=10, doc=doc_timeout), - Argument("totp_secret", str, optional=True, default=None, doc=doc_totp_secret), - Argument("tar_compress", bool, optional=True, default=True, doc=doc_tar_compress), + Argument( + "totp_secret", str, optional=True, default=None, doc=doc_totp_secret + ), + Argument( + "tar_compress", bool, optional=True, default=True, doc=doc_tar_compress + ), Argument( "look_for_keys", bool, @@ -349,7 +368,9 @@ def arginfo(): doc=doc_look_for_keys, ), ] - ssh_remote_profile_format = Argument("ssh_session", dict, ssh_remote_profile_args) + ssh_remote_profile_format = Argument( + "ssh_session", dict, ssh_remote_profile_args + ) return ssh_remote_profile_format def put(self, from_f, to_f): @@ -477,7 +498,9 @@ def bind_submission(self, submission): assert self.ssh_session is not None assert self.ssh_session.ssh is not None self.submission = submission - self.local_root = pathlib.PurePath(os.path.join(self.temp_local_root, submission.work_base)).as_posix() + self.local_root = pathlib.PurePath( + os.path.join(self.temp_local_root, submission.work_base) + ).as_posix() old_remote_root = self.remote_root # self.remote_root = os.path.join(self.temp_remote_root, self.submission.submission_hash, self.submission.work_base ) self.remote_root = pathlib.PurePath( @@ -490,7 +513,9 @@ def bind_submission(self, submission): and self.check_file_exists(old_remote_root) and not self.check_file_exists(self.remote_root) ): - self.block_checkcall(f"mv {shlex.quote(old_remote_root)} {shlex.quote(self.remote_root)}") + self.block_checkcall( + f"mv {shlex.quote(old_remote_root)} {shlex.quote(self.remote_root)}" + ) elif ( old_remote_root is not None and old_remote_root != self.remote_root @@ -525,7 +550,9 @@ def _walk_directory(self, files, work_path, file_list, directory_list): if os.path.isfile(file_name): file_list.append(file_name) elif os.path.isdir(file_name): - for root, dirs, files in os.walk(file_name, topdown=False, followlinks=True): + for root, dirs, files in os.walk( + file_name, topdown=False, followlinks=True + ): if not files: directory_list.append(root) for name in files: @@ -535,8 +562,12 @@ def _walk_directory(self, files, work_path, file_list, directory_list): elif glob(file_name): # If the file name contains a wildcard, os.path functions will fail to identify it. Use glob to get the complete list of filenames which match the wildcard. 
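One idiom that recurs throughout this file deserves a short illustration: remote paths are joined with os.path but normalized through PurePath(...).as_posix(), so a dispatcher running on Windows still hands POSIX paths to the remote shell; the paths below are hypothetical:

    import os
    import pathlib

    temp_remote_root = "/tmp/dpdispatcher"  # hypothetical remote scratch root
    submission_hash = "0a1b2c3d"            # hypothetical submission hash
    remote_root = pathlib.PurePath(
        os.path.join(temp_remote_root, submission_hash)
    ).as_posix()
    # On Windows, os.path.join inserts backslashes; as_posix() converts them
    # back, so the remote side always sees /tmp/dpdispatcher/0a1b2c3d.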
abs_file_list = glob(file_name) - rel_file_list = [os.path.relpath(ii, start=work_path) for ii in abs_file_list] - self._walk_directory(rel_file_list, work_path, file_list, directory_list) + rel_file_list = [ + os.path.relpath(ii, start=work_path) for ii in abs_file_list + ] + self._walk_directory( + rel_file_list, work_path, file_list, directory_list + ) else: raise FileNotFoundError(f"cannot find upload file {work_path} {jj}") @@ -571,7 +602,9 @@ def upload( file_list, directory_list, ) - self._walk_directory(submission.forward_common_files, self.local_root, file_list, directory_list) + self._walk_directory( + submission.forward_common_files, self.local_root, file_list, directory_list + ) # convert to relative path to local_root directory_list = [os.path.relpath(jj, self.local_root) for jj in directory_list] @@ -583,10 +616,14 @@ def upload( sha256_list = [] for jj in file_list: sha256 = get_sha256(jj) - jj_rel = pathlib.PurePath(os.path.relpath(jj, self.local_root)).as_posix() + jj_rel = pathlib.PurePath( + os.path.relpath(jj, self.local_root) + ).as_posix() sha256_list.append(f"{sha256} {jj_rel}") # write to remote - sha256_file = os.path.join(self.remote_root, ".tmp.sha256." + str(uuid.uuid4())) + sha256_file = os.path.join( + self.remote_root, ".tmp.sha256." + str(uuid.uuid4()) + ) self.write_file(sha256_file, "\n".join(sha256_list)) # check sha256 # `:` means pass: https://stackoverflow.com/a/2421592/9567349 @@ -613,7 +650,9 @@ def upload( def list_remote_dir(self, sftp, remote_dir, ref_remote_root, result_list): for entry in sftp.listdir_attr(remote_dir): - remote_name = pathlib.PurePath(os.path.join(remote_dir, entry.filename)).as_posix() + remote_name = pathlib.PurePath( + os.path.join(remote_dir, entry.filename) + ).as_posix() st_mode = entry.st_mode if S_ISDIR(st_mode): self.list_remote_dir(sftp, remote_name, ref_remote_root, result_list) @@ -642,16 +681,23 @@ def download( abs_file_list = fnmatch.filter(remote_file_list, jj) else: remote_file_list = [] - remote_job = pathlib.PurePath(os.path.join(self.remote_root, ii.task_work_path)).as_posix() - self.list_remote_dir(self.sftp, remote_job, remote_job, remote_file_list) + remote_job = pathlib.PurePath( + os.path.join(self.remote_root, ii.task_work_path) + ).as_posix() + self.list_remote_dir( + self.sftp, remote_job, remote_job, remote_file_list + ) abs_file_list = fnmatch.filter(remote_file_list, jj) rel_file_list = [ - pathlib.PurePath(os.path.join(ii.task_work_path, kk)).as_posix() for kk in abs_file_list + pathlib.PurePath(os.path.join(ii.task_work_path, kk)).as_posix() + for kk in abs_file_list ] else: - rel_file_list = [pathlib.PurePath(os.path.join(ii.task_work_path, jj)).as_posix()] + rel_file_list = [ + pathlib.PurePath(os.path.join(ii.task_work_path, jj)).as_posix() + ] if check_exists: for file_name in rel_file_list: if self.check_file_exists(file_name): @@ -675,14 +721,23 @@ def download( abs_errors = fnmatch.filter(remote_file_list, "error*") else: remote_file_list = [] - remote_job = pathlib.PurePath(os.path.join(self.remote_root, ii.task_work_path)).as_posix() - self.list_remote_dir(self.sftp, remote_job, remote_job, remote_file_list) + remote_job = pathlib.PurePath( + os.path.join(self.remote_root, ii.task_work_path) + ).as_posix() + self.list_remote_dir( + self.sftp, remote_job, remote_job, remote_file_list + ) abs_errors = fnmatch.filter(remote_file_list, "error*") - rel_errors = [pathlib.PurePath(os.path.join(ii.task_work_path, kk)).as_posix() for kk in abs_errors] + rel_errors = [ + 
pathlib.PurePath(os.path.join(ii.task_work_path, kk)).as_posix() + for kk in abs_errors + ] file_list.extend(rel_errors) file_list.extend(submission.backward_common_files) if len(file_list) > 0: - self._get_files(file_list, tar_compress=self.remote_profile.get("tar_compress", None)) + self._get_files( + file_list, tar_compress=self.remote_profile.get("tar_compress", None) + ) def block_checkcall(self, cmd, asynchronously=False, stderr_whitelist=None): """Run command with arguments. Wait for command to complete. If the return code @@ -703,7 +758,9 @@ def block_checkcall(self, cmd, asynchronously=False, stderr_whitelist=None): self.ssh_session.ensure_alive() if asynchronously: cmd = f"nohup {cmd} >/dev/null &" - stdin, stdout, stderr = self.ssh_session.exec_command((f"cd {shlex.quote(self.remote_root)} ;") + cmd) + stdin, stdout, stderr = self.ssh_session.exec_command( + (f"cd {shlex.quote(self.remote_root)} ;") + cmd + ) exit_status = stdout.channel.recv_exit_status() if exit_status != 0: raise RuntimeError( @@ -720,7 +777,9 @@ def block_checkcall(self, cmd, asynchronously=False, stderr_whitelist=None): def block_call(self, cmd): assert self.remote_root is not None self.ssh_session.ensure_alive() - stdin, stdout, stderr = self.ssh_session.exec_command((f"cd {shlex.quote(self.remote_root)} ;") + cmd) + stdin, stdout, stderr = self.ssh_session.exec_command( + (f"cd {shlex.quote(self.remote_root)} ;") + cmd + ) exit_status = stdout.channel.recv_exit_status() return exit_status, stdin, stdout, stderr @@ -748,7 +807,9 @@ def write_file(self, fname, write_str): def read_file(self, fname): assert self.remote_root is not None self.ssh_session.ensure_alive() - with self.sftp.open(pathlib.PurePath(os.path.join(self.remote_root, fname)).as_posix(), "r") as fp: + with self.sftp.open( + pathlib.PurePath(os.path.join(self.remote_root, fname)).as_posix(), "r" + ) as fp: ret = fp.read().decode("utf-8") return ret @@ -756,7 +817,9 @@ def check_file_exists(self, fname): assert self.remote_root is not None self.ssh_session.ensure_alive() try: - self.sftp.stat(pathlib.PurePath(os.path.join(self.remote_root, fname)).as_posix()) + self.sftp.stat( + pathlib.PurePath(os.path.join(self.remote_root, fname)).as_posix() + ) ret = True except OSError: ret = False @@ -889,18 +952,24 @@ def _get_files(self, files, tar_compress=True): file_list = " ".join([shlex.quote(file) for file in files]) tar_cmd = f"tar {tar_command} {shlex.quote(of)} {file_list}" else: - file_list_file = pathlib.PurePath(os.path.join(self.remote_root, f"tmp_tar_{uuid.uuid4()}")).as_posix() + file_list_file = pathlib.PurePath( + os.path.join(self.remote_root, f"tmp_tar_{uuid.uuid4()}") + ).as_posix() self.write_file(file_list_file, "\n".join(files)) # if not os.path.exists(file_list_file): # raise FileNotFoundError(f"File list was not created at {file_list_file}") - tar_cmd = f"tar {tar_command} {shlex.quote(of)} -T {shlex.quote(file_list_file)}" + tar_cmd = ( + f"tar {tar_command} {shlex.quote(of)} -T {shlex.quote(file_list_file)}" + ) # Execute the tar command remotely try: self.block_checkcall(tar_cmd) except RuntimeError as e: if "No such file or directory" in str(e): - raise FileNotFoundError("Backward files do not exist in the remote directory.") from e + raise FileNotFoundError( + "Backward files do not exist in the remote directory." 
+ ) from e raise e # Transfer the archive from remote to local @@ -925,7 +994,9 @@ def machine_subfields(cls) -> List[Argument]: list[Argument] machine subfields """ - doc_remote_profile = "The information used to maintain the connection with remote machine." + doc_remote_profile = ( + "The information used to maintain the connection with remote machine." + ) remote_profile_format = SSHSession.arginfo() remote_profile_format.name = "remote_profile" remote_profile_format.doc = doc_remote_profile diff --git a/dpdispatcher/machines/pbs.py b/dpdispatcher/machines/pbs.py index f05ce5ba..b1c319a5 100644 --- a/dpdispatcher/machines/pbs.py +++ b/dpdispatcher/machines/pbs.py @@ -28,15 +28,22 @@ def gen_script_header(self, job): f"#PBS -l select={resources.number_node}:ncpus={resources.cpu_per_node}" ) if resources.gpu_per_node != 0: - pbs_script_header_dict["select_node_line"] += f":ngpus={resources.gpu_per_node}" + pbs_script_header_dict["select_node_line"] += ( + f":ngpus={resources.gpu_per_node}" + ) pbs_script_header_dict["queue_name_line"] = f"#PBS -q {resources.queue_name}" - if resources["strategy"].get("customized_script_header_template_file") is not None: + if ( + resources["strategy"].get("customized_script_header_template_file") + is not None + ): pbs_script_header = customized_script_header_template( resources["strategy"]["customized_script_header_template_file"], resources, ) else: - pbs_script_header = pbs_script_header_template.format(**pbs_script_header_dict) + pbs_script_header = pbs_script_header_template.format( + **pbs_script_header_dict + ) return pbs_script_header def do_submit(self, job): @@ -53,7 +60,9 @@ def do_submit(self, job): script_file_dir = self.context.remote_root # stdin, stdout, stderr = self.context.block_checkcall('cd %s && %s %s' % (self.context.remote_root, 'qsub', script_file_name)) stdin, stdout, stderr = self.context.block_checkcall( - "cd {} && {} {}".format(shlex.quote(script_file_dir), "qsub", shlex.quote(script_file_name)) + "cd {} && {} {}".format( + shlex.quote(script_file_dir), "qsub", shlex.quote(script_file_name) + ) ) subret = stdout.readlines() job_id = subret[0].split()[0] @@ -77,7 +86,8 @@ def check_status(self, job): return JobStatus.terminated else: raise RuntimeError( - "status command qstat fails to execute. erro info: %s return code %d" % (err_str, ret) + "status command qstat fails to execute. erro info: %s return code %d" + % (err_str, ret) ) status_line = stdout.read().decode("utf-8").split("\n")[-2] status_word = status_line.split()[-2] @@ -126,7 +136,8 @@ def check_status(self, job): return JobStatus.terminated else: raise RuntimeError( - "status command qstat fails to execute. erro info: %s return code %d" % (err_str, ret) + "status command qstat fails to execute. 
erro info: %s return code %d" + % (err_str, ret) ) status_line = stdout.read().decode("utf-8").split("\n")[-2] status_word = status_line.split()[-2] @@ -152,15 +163,22 @@ def gen_script_header(self, job): f"#PBS -l nodes={resources.number_node}:ppn={resources.cpu_per_node}" ) if resources.gpu_per_node != 0: - pbs_script_header_dict["select_node_line"] += f":gpus={resources.gpu_per_node}" + pbs_script_header_dict["select_node_line"] += ( + f":gpus={resources.gpu_per_node}" + ) pbs_script_header_dict["queue_name_line"] = f"#PBS -q {resources.queue_name}" - if resources["strategy"].get("customized_script_header_template_file") is not None: + if ( + resources["strategy"].get("customized_script_header_template_file") + is not None + ): pbs_script_header = customized_script_header_template( resources["strategy"]["customized_script_header_template_file"], resources, ) else: - pbs_script_header = pbs_script_header_template.format(**pbs_script_header_dict) + pbs_script_header = pbs_script_header_template.format( + **pbs_script_header_dict + ) return pbs_script_header @@ -200,15 +218,24 @@ def gen_script_header(self, job): pe_name = resources.kwargs.get("pe_name", "mpi") sge_script_header_dict = {} sge_script_header_dict["select_node_line"] = f"#$ -N {job_name}\n" - sge_script_header_dict["select_node_line"] += f"#$ -pe {pe_name} {resources.cpu_per_node}\n" + sge_script_header_dict["select_node_line"] += ( + f"#$ -pe {pe_name} {resources.cpu_per_node}\n" + ) if resources.queue_name != "": - sge_script_header_dict["select_node_line"] += f"#$ -q {resources.queue_name}" - if resources["strategy"].get("customized_script_header_template_file") is not None: + sge_script_header_dict["select_node_line"] += ( + f"#$ -q {resources.queue_name}" + ) + if ( + resources["strategy"].get("customized_script_header_template_file") + is not None + ): file_name = resources["strategy"]["customized_script_header_template_file"] sge_script_header = customized_script_header_template(file_name, resources) else: - sge_script_header = sge_script_header_template.format(**sge_script_header_dict) + sge_script_header = sge_script_header_template.format( + **sge_script_header_dict + ) return sge_script_header def do_submit(self, job): @@ -239,7 +266,9 @@ def check_status(self, job): ret, stdin, stdout, stderr = self.context.block_call("qstat") err_str = stderr.read().decode("utf-8") if ret != 0: - raise RuntimeError(f"status command qstat fails to execute. erro info: {err_str} return code {ret}") + raise RuntimeError( + f"status command qstat fails to execute. erro info: {err_str} return code {ret}" + ) status_text_list = stdout.read().decode("utf-8").split("\n") for txt in status_text_list: if job_id in txt: @@ -250,7 +279,9 @@ def check_status(self, job): while count <= 6: if self.check_finish_tag(job=job): return JobStatus.finished - dlog.info(f"not tag_finished detected, execute sync command and wait. count {count}") + dlog.info( + f"not tag_finished detected, execute sync command and wait. 
count {count}" + ) self.context.block_call("sync") import time @@ -305,7 +336,6 @@ def resources_subfields(cls) -> List[Argument]: default="wDPjob", doc=doc_job_name, ), - ], optional=False, doc="Extra arguments.", From 37da47c2a6fc6de52b8468a1464f9ab3ca1daf74 Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Thu, 25 Jul 2024 16:40:41 +0900 Subject: [PATCH 34/76] u --- dpdispatcher/contexts/ssh_context.py | 160 ++++++++++----------------- dpdispatcher/machines/pbs.py | 62 +++-------- 2 files changed, 73 insertions(+), 149 deletions(-) diff --git a/dpdispatcher/contexts/ssh_context.py b/dpdispatcher/contexts/ssh_context.py index 4390c0d6..68719a3f 100644 --- a/dpdispatcher/contexts/ssh_context.py +++ b/dpdispatcher/contexts/ssh_context.py @@ -87,10 +87,7 @@ def ensure_alive(self, max_check=10, sleep_time=10): count = 1 while not self._check_alive(): if count == max_check: - raise RuntimeError( - "cannot connect ssh after %d failures at interval %d s" - % (max_check, sleep_time) - ) + raise RuntimeError("cannot connect ssh after %d failures at interval %d s" % (max_check, sleep_time)) dlog.info("connection check failed, try to reconnect to " + self.hostname) self._setup_ssh() count += 1 @@ -168,9 +165,7 @@ def _setup_ssh(self): ): try: # passing empty passphrase would not raise error. - key = pkey_class.from_private_key_file( - key_path, self.passphrase - ) + key = pkey_class.from_private_key_file(key_path, self.passphrase) except paramiko.SSHException as e: pass if key is not None: @@ -185,9 +180,7 @@ def _setup_ssh(self): (paramiko.Ed25519Key, "ed25519"), ]: for directory in [".ssh", "ssh"]: - full_path = os.path.join( - os.path.expanduser("~"), directory, f"id_{name}" - ) + full_path = os.path.join(os.path.expanduser("~"), directory, f"id_{name}") if os.path.isfile(full_path): keyfiles.append((keytype, full_path)) # TODO: supporting cert @@ -220,9 +213,7 @@ def _setup_ssh(self): elif self.password is not None: ts.auth_password(self.username, self.password) elif key_error is not None: - raise RuntimeError( - "Authentication failed, try to provide password" - ) from key_error + raise RuntimeError("Authentication failed, try to provide password") from key_error else: raise RuntimeError("Please provide at least one form of authentication") assert ts.is_active() @@ -269,11 +260,7 @@ def inter_handler(self, title, instructions, prompt_list): resp.append(self.username) elif "password" in pr_str: resp.append(self.password) - elif ( - "verification" in pr_str - or "token" in pr_str - and self.totp_secret is not None - ): + elif "verification" in pr_str or "token" in pr_str and self.totp_secret is not None: assert self.totp_secret is not None resp.append(generate_totp(self.totp_secret)) @@ -295,7 +282,11 @@ def exec_command(self, cmd): assert self.ssh is not None try: return self.ssh.exec_command(cmd) - except (paramiko.ssh_exception.SSHException, socket.timeout, EOFError) as e: + except ( + paramiko.ssh_exception.SSHException, + socket.timeout, + EOFError, + ) as e: # SSH session not active # retry for up to 3 times # ensure alive @@ -321,19 +312,17 @@ def arginfo(): ) doc_port = "ssh connection port." doc_key_filename = ( - "key filename used by ssh connection. If left None, find key in ~/.ssh or " - "use password for login" + "key filename used by ssh connection. 
If left None, find key in ~/.ssh or " "use password for login" ) doc_passphrase = "passphrase of key used by ssh connection" doc_timeout = "timeout of ssh connection" doc_totp_secret = ( - "Time-based one time password secret. It should be a base32-encoded string" - " extracted from the 2D code." + "Time-based one time password secret. It should be a base32-encoded string" " extracted from the 2D code." ) - doc_tar_compress = "The archive will be compressed in upload and download if it is True. If not, compression will be skipped." - doc_look_for_keys = ( - "enable searching for discoverable private key files in ~/.ssh/" + doc_tar_compress = ( + "The archive will be compressed in upload and download if it is True. If not, compression will be skipped." ) + doc_look_for_keys = "enable searching for discoverable private key files in ~/.ssh/" ssh_remote_profile_args = [ Argument("hostname", str, optional=False, doc=doc_hostname), Argument("username", str, optional=False, doc=doc_username), @@ -355,10 +344,18 @@ def arginfo(): ), Argument("timeout", int, optional=True, default=10, doc=doc_timeout), Argument( - "totp_secret", str, optional=True, default=None, doc=doc_totp_secret + "totp_secret", + str, + optional=True, + default=None, + doc=doc_totp_secret, ), Argument( - "tar_compress", bool, optional=True, default=True, doc=doc_tar_compress + "tar_compress", + bool, + optional=True, + default=True, + doc=doc_tar_compress, ), Argument( "look_for_keys", @@ -368,9 +365,7 @@ def arginfo(): doc=doc_look_for_keys, ), ] - ssh_remote_profile_format = Argument( - "ssh_session", dict, ssh_remote_profile_args - ) + ssh_remote_profile_format = Argument("ssh_session", dict, ssh_remote_profile_args) return ssh_remote_profile_format def put(self, from_f, to_f): @@ -498,9 +493,7 @@ def bind_submission(self, submission): assert self.ssh_session is not None assert self.ssh_session.ssh is not None self.submission = submission - self.local_root = pathlib.PurePath( - os.path.join(self.temp_local_root, submission.work_base) - ).as_posix() + self.local_root = pathlib.PurePath(os.path.join(self.temp_local_root, submission.work_base)).as_posix() old_remote_root = self.remote_root # self.remote_root = os.path.join(self.temp_remote_root, self.submission.submission_hash, self.submission.work_base ) self.remote_root = pathlib.PurePath( @@ -513,9 +506,7 @@ def bind_submission(self, submission): and self.check_file_exists(old_remote_root) and not self.check_file_exists(self.remote_root) ): - self.block_checkcall( - f"mv {shlex.quote(old_remote_root)} {shlex.quote(self.remote_root)}" - ) + self.block_checkcall(f"mv {shlex.quote(old_remote_root)} {shlex.quote(self.remote_root)}") elif ( old_remote_root is not None and old_remote_root != self.remote_root @@ -550,9 +541,7 @@ def _walk_directory(self, files, work_path, file_list, directory_list): if os.path.isfile(file_name): file_list.append(file_name) elif os.path.isdir(file_name): - for root, dirs, files in os.walk( - file_name, topdown=False, followlinks=True - ): + for root, dirs, files in os.walk(file_name, topdown=False, followlinks=True): if not files: directory_list.append(root) for name in files: @@ -562,12 +551,8 @@ def _walk_directory(self, files, work_path, file_list, directory_list): elif glob(file_name): # If the file name contains a wildcard, os.path functions will fail to identify it. Use glob to get the complete list of filenames which match the wildcard. 
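The upload() path a little further on verifies every transfer with per-file SHA-256 sums; a minimal sketch of the local hashing half, assuming a helper equivalent to dpdispatcher's get_sha256:

    import hashlib

    def get_sha256(path):
        # stream the file so large inputs are never held in memory at once
        h = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 16), b""):
                h.update(chunk)
        return h.hexdigest()

    # upload() writes "<sha256> <relative path>" lines into a temporary remote
    # file and lets the remote side re-check them after the archive lands.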
abs_file_list = glob(file_name) - rel_file_list = [ - os.path.relpath(ii, start=work_path) for ii in abs_file_list - ] - self._walk_directory( - rel_file_list, work_path, file_list, directory_list - ) + rel_file_list = [os.path.relpath(ii, start=work_path) for ii in abs_file_list] + self._walk_directory(rel_file_list, work_path, file_list, directory_list) else: raise FileNotFoundError(f"cannot find upload file {work_path} {jj}") @@ -603,7 +588,10 @@ def upload( directory_list, ) self._walk_directory( - submission.forward_common_files, self.local_root, file_list, directory_list + submission.forward_common_files, + self.local_root, + file_list, + directory_list, ) # convert to relative path to local_root @@ -616,14 +604,10 @@ def upload( sha256_list = [] for jj in file_list: sha256 = get_sha256(jj) - jj_rel = pathlib.PurePath( - os.path.relpath(jj, self.local_root) - ).as_posix() + jj_rel = pathlib.PurePath(os.path.relpath(jj, self.local_root)).as_posix() sha256_list.append(f"{sha256} {jj_rel}") # write to remote - sha256_file = os.path.join( - self.remote_root, ".tmp.sha256." + str(uuid.uuid4()) - ) + sha256_file = os.path.join(self.remote_root, ".tmp.sha256." + str(uuid.uuid4())) self.write_file(sha256_file, "\n".join(sha256_list)) # check sha256 # `:` means pass: https://stackoverflow.com/a/2421592/9567349 @@ -650,9 +634,7 @@ def upload( def list_remote_dir(self, sftp, remote_dir, ref_remote_root, result_list): for entry in sftp.listdir_attr(remote_dir): - remote_name = pathlib.PurePath( - os.path.join(remote_dir, entry.filename) - ).as_posix() + remote_name = pathlib.PurePath(os.path.join(remote_dir, entry.filename)).as_posix() st_mode = entry.st_mode if S_ISDIR(st_mode): self.list_remote_dir(sftp, remote_name, ref_remote_root, result_list) @@ -681,23 +663,16 @@ def download( abs_file_list = fnmatch.filter(remote_file_list, jj) else: remote_file_list = [] - remote_job = pathlib.PurePath( - os.path.join(self.remote_root, ii.task_work_path) - ).as_posix() - self.list_remote_dir( - self.sftp, remote_job, remote_job, remote_file_list - ) + remote_job = pathlib.PurePath(os.path.join(self.remote_root, ii.task_work_path)).as_posix() + self.list_remote_dir(self.sftp, remote_job, remote_job, remote_file_list) abs_file_list = fnmatch.filter(remote_file_list, jj) rel_file_list = [ - pathlib.PurePath(os.path.join(ii.task_work_path, kk)).as_posix() - for kk in abs_file_list + pathlib.PurePath(os.path.join(ii.task_work_path, kk)).as_posix() for kk in abs_file_list ] else: - rel_file_list = [ - pathlib.PurePath(os.path.join(ii.task_work_path, jj)).as_posix() - ] + rel_file_list = [pathlib.PurePath(os.path.join(ii.task_work_path, jj)).as_posix()] if check_exists: for file_name in rel_file_list: if self.check_file_exists(file_name): @@ -721,22 +696,16 @@ def download( abs_errors = fnmatch.filter(remote_file_list, "error*") else: remote_file_list = [] - remote_job = pathlib.PurePath( - os.path.join(self.remote_root, ii.task_work_path) - ).as_posix() - self.list_remote_dir( - self.sftp, remote_job, remote_job, remote_file_list - ) + remote_job = pathlib.PurePath(os.path.join(self.remote_root, ii.task_work_path)).as_posix() + self.list_remote_dir(self.sftp, remote_job, remote_job, remote_file_list) abs_errors = fnmatch.filter(remote_file_list, "error*") - rel_errors = [ - pathlib.PurePath(os.path.join(ii.task_work_path, kk)).as_posix() - for kk in abs_errors - ] + rel_errors = [pathlib.PurePath(os.path.join(ii.task_work_path, kk)).as_posix() for kk in abs_errors] file_list.extend(rel_errors) 
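The _get_files() helper further down turns such file lists into a tar invocation; a hedged sketch of that batching logic, with the threshold and -T fallback taken from the surrounding hunks and the flag string and file names assumed:

    import shlex
    import uuid

    files = [f"task_{i:04d}/log" for i in range(250)]  # hypothetical downloads
    of = "backward.tar.gz"                             # hypothetical archive name
    tar_command = "czfh"  # assumption: gzip flags; plain "cfh" when tar_compress is off
    per_nfile = 100
    if len(files) // per_nfile + 1 <= 1:
        # few files: pass the quoted names directly on the command line
        name_args = " ".join(shlex.quote(f) for f in files)
        tar_cmd = f"tar {tar_command} {shlex.quote(of)} {name_args}"
    else:
        # many files: write the list to a remote temp file and let tar read it via -T
        file_list_file = f".tmp_tar_{uuid.uuid4()}"
        tar_cmd = f"tar {tar_command} {shlex.quote(of)} -T {shlex.quote(file_list_file)}"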
file_list.extend(submission.backward_common_files) if len(file_list) > 0: self._get_files( - file_list, tar_compress=self.remote_profile.get("tar_compress", None) + file_list, + tar_compress=self.remote_profile.get("tar_compress", None), ) def block_checkcall(self, cmd, asynchronously=False, stderr_whitelist=None): @@ -758,9 +727,7 @@ def block_checkcall(self, cmd, asynchronously=False, stderr_whitelist=None): self.ssh_session.ensure_alive() if asynchronously: cmd = f"nohup {cmd} >/dev/null &" - stdin, stdout, stderr = self.ssh_session.exec_command( - (f"cd {shlex.quote(self.remote_root)} ;") + cmd - ) + stdin, stdout, stderr = self.ssh_session.exec_command((f"cd {shlex.quote(self.remote_root)} ;") + cmd) exit_status = stdout.channel.recv_exit_status() if exit_status != 0: raise RuntimeError( @@ -777,9 +744,7 @@ def block_checkcall(self, cmd, asynchronously=False, stderr_whitelist=None): def block_call(self, cmd): assert self.remote_root is not None self.ssh_session.ensure_alive() - stdin, stdout, stderr = self.ssh_session.exec_command( - (f"cd {shlex.quote(self.remote_root)} ;") + cmd - ) + stdin, stdout, stderr = self.ssh_session.exec_command((f"cd {shlex.quote(self.remote_root)} ;") + cmd) exit_status = stdout.channel.recv_exit_status() return exit_status, stdin, stdout, stderr @@ -801,14 +766,15 @@ def write_file(self, fname, write_str): self.block_checkcall(f"mv {shlex.quote(temp_fname)} {shlex.quote(fname)}") # sftp.rename may throw OSError except OSError as e: - print(f"Error writing to file {fname}") + dlog.exception(f"Error writing to file {fname}") raise e def read_file(self, fname): assert self.remote_root is not None self.ssh_session.ensure_alive() with self.sftp.open( - pathlib.PurePath(os.path.join(self.remote_root, fname)).as_posix(), "r" + pathlib.PurePath(os.path.join(self.remote_root, fname)).as_posix(), + "r", ) as fp: ret = fp.read().decode("utf-8") return ret @@ -817,9 +783,7 @@ def check_file_exists(self, fname): assert self.remote_root is not None self.ssh_session.ensure_alive() try: - self.sftp.stat( - pathlib.PurePath(os.path.join(self.remote_root, fname)).as_posix() - ) + self.sftp.stat(pathlib.PurePath(os.path.join(self.remote_root, fname)).as_posix()) ret = True except OSError: ret = False @@ -952,24 +916,16 @@ def _get_files(self, files, tar_compress=True): file_list = " ".join([shlex.quote(file) for file in files]) tar_cmd = f"tar {tar_command} {shlex.quote(of)} {file_list}" else: - file_list_file = pathlib.PurePath( - os.path.join(self.remote_root, f"tmp_tar_{uuid.uuid4()}") - ).as_posix() + file_list_file = pathlib.PurePath(os.path.join(self.remote_root, f".tmp_tar_{uuid.uuid4()}")).as_posix() self.write_file(file_list_file, "\n".join(files)) - # if not os.path.exists(file_list_file): - # raise FileNotFoundError(f"File list was not created at {file_list_file}") - tar_cmd = ( - f"tar {tar_command} {shlex.quote(of)} -T {shlex.quote(file_list_file)}" - ) + tar_cmd = f"tar {tar_command} {shlex.quote(of)} -T {shlex.quote(file_list_file)}" # Execute the tar command remotely try: self.block_checkcall(tar_cmd) except RuntimeError as e: if "No such file or directory" in str(e): - raise FileNotFoundError( - "Backward files do not exist in the remote directory." 
- ) from e + raise FileNotFoundError("Backward files do not exist in the remote directory.") from e raise e # Transfer the archive from remote to local @@ -994,9 +950,7 @@ def machine_subfields(cls) -> List[Argument]: list[Argument] machine subfields """ - doc_remote_profile = ( - "The information used to maintain the connection with remote machine." - ) + doc_remote_profile = "The information used to maintain the connection with remote machine." remote_profile_format = SSHSession.arginfo() remote_profile_format.name = "remote_profile" remote_profile_format.doc = doc_remote_profile diff --git a/dpdispatcher/machines/pbs.py b/dpdispatcher/machines/pbs.py index b1c319a5..c2b64254 100644 --- a/dpdispatcher/machines/pbs.py +++ b/dpdispatcher/machines/pbs.py @@ -28,22 +28,15 @@ def gen_script_header(self, job): f"#PBS -l select={resources.number_node}:ncpus={resources.cpu_per_node}" ) if resources.gpu_per_node != 0: - pbs_script_header_dict["select_node_line"] += ( - f":ngpus={resources.gpu_per_node}" - ) + pbs_script_header_dict["select_node_line"] += f":ngpus={resources.gpu_per_node}" pbs_script_header_dict["queue_name_line"] = f"#PBS -q {resources.queue_name}" - if ( - resources["strategy"].get("customized_script_header_template_file") - is not None - ): + if resources["strategy"].get("customized_script_header_template_file") is not None: pbs_script_header = customized_script_header_template( resources["strategy"]["customized_script_header_template_file"], resources, ) else: - pbs_script_header = pbs_script_header_template.format( - **pbs_script_header_dict - ) + pbs_script_header = pbs_script_header_template.format(**pbs_script_header_dict) return pbs_script_header def do_submit(self, job): @@ -60,9 +53,7 @@ def do_submit(self, job): script_file_dir = self.context.remote_root # stdin, stdout, stderr = self.context.block_checkcall('cd %s && %s %s' % (self.context.remote_root, 'qsub', script_file_name)) stdin, stdout, stderr = self.context.block_checkcall( - "cd {} && {} {}".format( - shlex.quote(script_file_dir), "qsub", shlex.quote(script_file_name) - ) + "cd {} && {} {}".format(shlex.quote(script_file_dir), "qsub", shlex.quote(script_file_name)) ) subret = stdout.readlines() job_id = subret[0].split()[0] @@ -86,8 +77,7 @@ def check_status(self, job): return JobStatus.terminated else: raise RuntimeError( - "status command qstat fails to execute. erro info: %s return code %d" - % (err_str, ret) + "status command qstat fails to execute. erro info: %s return code %d" % (err_str, ret) ) status_line = stdout.read().decode("utf-8").split("\n")[-2] status_word = status_line.split()[-2] @@ -136,8 +126,7 @@ def check_status(self, job): return JobStatus.terminated else: raise RuntimeError( - "status command qstat fails to execute. erro info: %s return code %d" - % (err_str, ret) + "status command qstat fails to execute. 
erro info: %s return code %d" % (err_str, ret) ) status_line = stdout.read().decode("utf-8").split("\n")[-2] status_word = status_line.split()[-2] @@ -163,22 +152,15 @@ def gen_script_header(self, job): f"#PBS -l nodes={resources.number_node}:ppn={resources.cpu_per_node}" ) if resources.gpu_per_node != 0: - pbs_script_header_dict["select_node_line"] += ( - f":gpus={resources.gpu_per_node}" - ) + pbs_script_header_dict["select_node_line"] += f":gpus={resources.gpu_per_node}" pbs_script_header_dict["queue_name_line"] = f"#PBS -q {resources.queue_name}" - if ( - resources["strategy"].get("customized_script_header_template_file") - is not None - ): + if resources["strategy"].get("customized_script_header_template_file") is not None: pbs_script_header = customized_script_header_template( resources["strategy"]["customized_script_header_template_file"], resources, ) else: - pbs_script_header = pbs_script_header_template.format( - **pbs_script_header_dict - ) + pbs_script_header = pbs_script_header_template.format(**pbs_script_header_dict) return pbs_script_header @@ -218,24 +200,15 @@ def gen_script_header(self, job): pe_name = resources.kwargs.get("pe_name", "mpi") sge_script_header_dict = {} sge_script_header_dict["select_node_line"] = f"#$ -N {job_name}\n" - sge_script_header_dict["select_node_line"] += ( - f"#$ -pe {pe_name} {resources.cpu_per_node}\n" - ) + sge_script_header_dict["select_node_line"] += f"#$ -pe {pe_name} {resources.cpu_per_node}\n" if resources.queue_name != "": - sge_script_header_dict["select_node_line"] += ( - f"#$ -q {resources.queue_name}" - ) - if ( - resources["strategy"].get("customized_script_header_template_file") - is not None - ): + sge_script_header_dict["select_node_line"] += f"#$ -q {resources.queue_name}" + if resources["strategy"].get("customized_script_header_template_file") is not None: file_name = resources["strategy"]["customized_script_header_template_file"] sge_script_header = customized_script_header_template(file_name, resources) else: - sge_script_header = sge_script_header_template.format( - **sge_script_header_dict - ) + sge_script_header = sge_script_header_template.format(**sge_script_header_dict) return sge_script_header def do_submit(self, job): @@ -266,9 +239,7 @@ def check_status(self, job): ret, stdin, stdout, stderr = self.context.block_call("qstat") err_str = stderr.read().decode("utf-8") if ret != 0: - raise RuntimeError( - f"status command qstat fails to execute. erro info: {err_str} return code {ret}" - ) + raise RuntimeError(f"status command qstat fails to execute. erro info: {err_str} return code {ret}") status_text_list = stdout.read().decode("utf-8").split("\n") for txt in status_text_list: if job_id in txt: @@ -279,9 +250,7 @@ def check_status(self, job): while count <= 6: if self.check_finish_tag(job=job): return JobStatus.finished - dlog.info( - f"not tag_finished detected, execute sync command and wait. count {count}" - ) + dlog.info(f"not tag_finished detected, execute sync command and wait. 
count {count}") self.context.block_call("sync") import time @@ -328,6 +297,7 @@ def resources_subfields(cls) -> List[Argument]: optional=True, default="mpi", doc=doc_pe_name, + alias=["sge_pe_name"], ), Argument( "job_name", From bca9728122a88f684f31cc7b9d155c2a29fda0d4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 25 Jul 2024 07:40:52 +0000 Subject: [PATCH 35/76] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dpdispatcher/contexts/ssh_context.py | 127 ++++++++++++++++++++------- dpdispatcher/machines/pbs.py | 61 +++++++++---- 2 files changed, 140 insertions(+), 48 deletions(-) diff --git a/dpdispatcher/contexts/ssh_context.py b/dpdispatcher/contexts/ssh_context.py index 68719a3f..5d9bb1de 100644 --- a/dpdispatcher/contexts/ssh_context.py +++ b/dpdispatcher/contexts/ssh_context.py @@ -87,7 +87,10 @@ def ensure_alive(self, max_check=10, sleep_time=10): count = 1 while not self._check_alive(): if count == max_check: - raise RuntimeError("cannot connect ssh after %d failures at interval %d s" % (max_check, sleep_time)) + raise RuntimeError( + "cannot connect ssh after %d failures at interval %d s" + % (max_check, sleep_time) + ) dlog.info("connection check failed, try to reconnect to " + self.hostname) self._setup_ssh() count += 1 @@ -165,7 +168,9 @@ def _setup_ssh(self): ): try: # passing empty passphrase would not raise error. - key = pkey_class.from_private_key_file(key_path, self.passphrase) + key = pkey_class.from_private_key_file( + key_path, self.passphrase + ) except paramiko.SSHException as e: pass if key is not None: @@ -180,7 +185,9 @@ def _setup_ssh(self): (paramiko.Ed25519Key, "ed25519"), ]: for directory in [".ssh", "ssh"]: - full_path = os.path.join(os.path.expanduser("~"), directory, f"id_{name}") + full_path = os.path.join( + os.path.expanduser("~"), directory, f"id_{name}" + ) if os.path.isfile(full_path): keyfiles.append((keytype, full_path)) # TODO: supporting cert @@ -213,7 +220,9 @@ def _setup_ssh(self): elif self.password is not None: ts.auth_password(self.username, self.password) elif key_error is not None: - raise RuntimeError("Authentication failed, try to provide password") from key_error + raise RuntimeError( + "Authentication failed, try to provide password" + ) from key_error else: raise RuntimeError("Please provide at least one form of authentication") assert ts.is_active() @@ -260,7 +269,11 @@ def inter_handler(self, title, instructions, prompt_list): resp.append(self.username) elif "password" in pr_str: resp.append(self.password) - elif "verification" in pr_str or "token" in pr_str and self.totp_secret is not None: + elif ( + "verification" in pr_str + or "token" in pr_str + and self.totp_secret is not None + ): assert self.totp_secret is not None resp.append(generate_totp(self.totp_secret)) @@ -312,17 +325,19 @@ def arginfo(): ) doc_port = "ssh connection port." doc_key_filename = ( - "key filename used by ssh connection. If left None, find key in ~/.ssh or " "use password for login" + "key filename used by ssh connection. If left None, find key in ~/.ssh or " + "use password for login" ) doc_passphrase = "passphrase of key used by ssh connection" doc_timeout = "timeout of ssh connection" doc_totp_secret = ( - "Time-based one time password secret. It should be a base32-encoded string" " extracted from the 2D code." + "Time-based one time password secret. It should be a base32-encoded string" + " extracted from the 2D code." 
) - doc_tar_compress = ( - "The archive will be compressed in upload and download if it is True. If not, compression will be skipped." + doc_tar_compress = "The archive will be compressed in upload and download if it is True. If not, compression will be skipped." + doc_look_for_keys = ( + "enable searching for discoverable private key files in ~/.ssh/" ) - doc_look_for_keys = "enable searching for discoverable private key files in ~/.ssh/" ssh_remote_profile_args = [ Argument("hostname", str, optional=False, doc=doc_hostname), Argument("username", str, optional=False, doc=doc_username), @@ -365,7 +380,9 @@ def arginfo(): doc=doc_look_for_keys, ), ] - ssh_remote_profile_format = Argument("ssh_session", dict, ssh_remote_profile_args) + ssh_remote_profile_format = Argument( + "ssh_session", dict, ssh_remote_profile_args + ) return ssh_remote_profile_format def put(self, from_f, to_f): @@ -493,7 +510,9 @@ def bind_submission(self, submission): assert self.ssh_session is not None assert self.ssh_session.ssh is not None self.submission = submission - self.local_root = pathlib.PurePath(os.path.join(self.temp_local_root, submission.work_base)).as_posix() + self.local_root = pathlib.PurePath( + os.path.join(self.temp_local_root, submission.work_base) + ).as_posix() old_remote_root = self.remote_root # self.remote_root = os.path.join(self.temp_remote_root, self.submission.submission_hash, self.submission.work_base ) self.remote_root = pathlib.PurePath( @@ -506,7 +525,9 @@ def bind_submission(self, submission): and self.check_file_exists(old_remote_root) and not self.check_file_exists(self.remote_root) ): - self.block_checkcall(f"mv {shlex.quote(old_remote_root)} {shlex.quote(self.remote_root)}") + self.block_checkcall( + f"mv {shlex.quote(old_remote_root)} {shlex.quote(self.remote_root)}" + ) elif ( old_remote_root is not None and old_remote_root != self.remote_root @@ -541,7 +562,9 @@ def _walk_directory(self, files, work_path, file_list, directory_list): if os.path.isfile(file_name): file_list.append(file_name) elif os.path.isdir(file_name): - for root, dirs, files in os.walk(file_name, topdown=False, followlinks=True): + for root, dirs, files in os.walk( + file_name, topdown=False, followlinks=True + ): if not files: directory_list.append(root) for name in files: @@ -551,8 +574,12 @@ def _walk_directory(self, files, work_path, file_list, directory_list): elif glob(file_name): # If the file name contains a wildcard, os.path functions will fail to identify it. Use glob to get the complete list of filenames which match the wildcard. abs_file_list = glob(file_name) - rel_file_list = [os.path.relpath(ii, start=work_path) for ii in abs_file_list] - self._walk_directory(rel_file_list, work_path, file_list, directory_list) + rel_file_list = [ + os.path.relpath(ii, start=work_path) for ii in abs_file_list + ] + self._walk_directory( + rel_file_list, work_path, file_list, directory_list + ) else: raise FileNotFoundError(f"cannot find upload file {work_path} {jj}") @@ -604,10 +631,14 @@ def upload( sha256_list = [] for jj in file_list: sha256 = get_sha256(jj) - jj_rel = pathlib.PurePath(os.path.relpath(jj, self.local_root)).as_posix() + jj_rel = pathlib.PurePath( + os.path.relpath(jj, self.local_root) + ).as_posix() sha256_list.append(f"{sha256} {jj_rel}") # write to remote - sha256_file = os.path.join(self.remote_root, ".tmp.sha256." + str(uuid.uuid4())) + sha256_file = os.path.join( + self.remote_root, ".tmp.sha256." 
+ str(uuid.uuid4()) + ) self.write_file(sha256_file, "\n".join(sha256_list)) # check sha256 # `:` means pass: https://stackoverflow.com/a/2421592/9567349 @@ -634,7 +665,9 @@ def upload( def list_remote_dir(self, sftp, remote_dir, ref_remote_root, result_list): for entry in sftp.listdir_attr(remote_dir): - remote_name = pathlib.PurePath(os.path.join(remote_dir, entry.filename)).as_posix() + remote_name = pathlib.PurePath( + os.path.join(remote_dir, entry.filename) + ).as_posix() st_mode = entry.st_mode if S_ISDIR(st_mode): self.list_remote_dir(sftp, remote_name, ref_remote_root, result_list) @@ -663,16 +696,23 @@ def download( abs_file_list = fnmatch.filter(remote_file_list, jj) else: remote_file_list = [] - remote_job = pathlib.PurePath(os.path.join(self.remote_root, ii.task_work_path)).as_posix() - self.list_remote_dir(self.sftp, remote_job, remote_job, remote_file_list) + remote_job = pathlib.PurePath( + os.path.join(self.remote_root, ii.task_work_path) + ).as_posix() + self.list_remote_dir( + self.sftp, remote_job, remote_job, remote_file_list + ) abs_file_list = fnmatch.filter(remote_file_list, jj) rel_file_list = [ - pathlib.PurePath(os.path.join(ii.task_work_path, kk)).as_posix() for kk in abs_file_list + pathlib.PurePath(os.path.join(ii.task_work_path, kk)).as_posix() + for kk in abs_file_list ] else: - rel_file_list = [pathlib.PurePath(os.path.join(ii.task_work_path, jj)).as_posix()] + rel_file_list = [ + pathlib.PurePath(os.path.join(ii.task_work_path, jj)).as_posix() + ] if check_exists: for file_name in rel_file_list: if self.check_file_exists(file_name): @@ -696,10 +736,17 @@ def download( abs_errors = fnmatch.filter(remote_file_list, "error*") else: remote_file_list = [] - remote_job = pathlib.PurePath(os.path.join(self.remote_root, ii.task_work_path)).as_posix() - self.list_remote_dir(self.sftp, remote_job, remote_job, remote_file_list) + remote_job = pathlib.PurePath( + os.path.join(self.remote_root, ii.task_work_path) + ).as_posix() + self.list_remote_dir( + self.sftp, remote_job, remote_job, remote_file_list + ) abs_errors = fnmatch.filter(remote_file_list, "error*") - rel_errors = [pathlib.PurePath(os.path.join(ii.task_work_path, kk)).as_posix() for kk in abs_errors] + rel_errors = [ + pathlib.PurePath(os.path.join(ii.task_work_path, kk)).as_posix() + for kk in abs_errors + ] file_list.extend(rel_errors) file_list.extend(submission.backward_common_files) if len(file_list) > 0: @@ -727,7 +774,9 @@ def block_checkcall(self, cmd, asynchronously=False, stderr_whitelist=None): self.ssh_session.ensure_alive() if asynchronously: cmd = f"nohup {cmd} >/dev/null &" - stdin, stdout, stderr = self.ssh_session.exec_command((f"cd {shlex.quote(self.remote_root)} ;") + cmd) + stdin, stdout, stderr = self.ssh_session.exec_command( + (f"cd {shlex.quote(self.remote_root)} ;") + cmd + ) exit_status = stdout.channel.recv_exit_status() if exit_status != 0: raise RuntimeError( @@ -744,7 +793,9 @@ def block_checkcall(self, cmd, asynchronously=False, stderr_whitelist=None): def block_call(self, cmd): assert self.remote_root is not None self.ssh_session.ensure_alive() - stdin, stdout, stderr = self.ssh_session.exec_command((f"cd {shlex.quote(self.remote_root)} ;") + cmd) + stdin, stdout, stderr = self.ssh_session.exec_command( + (f"cd {shlex.quote(self.remote_root)} ;") + cmd + ) exit_status = stdout.channel.recv_exit_status() return exit_status, stdin, stdout, stderr @@ -783,7 +834,9 @@ def check_file_exists(self, fname): assert self.remote_root is not None self.ssh_session.ensure_alive() try: - 
self.sftp.stat(pathlib.PurePath(os.path.join(self.remote_root, fname)).as_posix()) + self.sftp.stat( + pathlib.PurePath(os.path.join(self.remote_root, fname)).as_posix() + ) ret = True except OSError: ret = False @@ -916,16 +969,22 @@ def _get_files(self, files, tar_compress=True): file_list = " ".join([shlex.quote(file) for file in files]) tar_cmd = f"tar {tar_command} {shlex.quote(of)} {file_list}" else: - file_list_file = pathlib.PurePath(os.path.join(self.remote_root, f".tmp_tar_{uuid.uuid4()}")).as_posix() + file_list_file = pathlib.PurePath( + os.path.join(self.remote_root, f".tmp_tar_{uuid.uuid4()}") + ).as_posix() self.write_file(file_list_file, "\n".join(files)) - tar_cmd = f"tar {tar_command} {shlex.quote(of)} -T {shlex.quote(file_list_file)}" + tar_cmd = ( + f"tar {tar_command} {shlex.quote(of)} -T {shlex.quote(file_list_file)}" + ) # Execute the tar command remotely try: self.block_checkcall(tar_cmd) except RuntimeError as e: if "No such file or directory" in str(e): - raise FileNotFoundError("Backward files do not exist in the remote directory.") from e + raise FileNotFoundError( + "Backward files do not exist in the remote directory." + ) from e raise e # Transfer the archive from remote to local @@ -950,7 +1009,9 @@ def machine_subfields(cls) -> List[Argument]: list[Argument] machine subfields """ - doc_remote_profile = "The information used to maintain the connection with remote machine." + doc_remote_profile = ( + "The information used to maintain the connection with remote machine." + ) remote_profile_format = SSHSession.arginfo() remote_profile_format.name = "remote_profile" remote_profile_format.doc = doc_remote_profile diff --git a/dpdispatcher/machines/pbs.py b/dpdispatcher/machines/pbs.py index c2b64254..e0fbe97e 100644 --- a/dpdispatcher/machines/pbs.py +++ b/dpdispatcher/machines/pbs.py @@ -28,15 +28,22 @@ def gen_script_header(self, job): f"#PBS -l select={resources.number_node}:ncpus={resources.cpu_per_node}" ) if resources.gpu_per_node != 0: - pbs_script_header_dict["select_node_line"] += f":ngpus={resources.gpu_per_node}" + pbs_script_header_dict["select_node_line"] += ( + f":ngpus={resources.gpu_per_node}" + ) pbs_script_header_dict["queue_name_line"] = f"#PBS -q {resources.queue_name}" - if resources["strategy"].get("customized_script_header_template_file") is not None: + if ( + resources["strategy"].get("customized_script_header_template_file") + is not None + ): pbs_script_header = customized_script_header_template( resources["strategy"]["customized_script_header_template_file"], resources, ) else: - pbs_script_header = pbs_script_header_template.format(**pbs_script_header_dict) + pbs_script_header = pbs_script_header_template.format( + **pbs_script_header_dict + ) return pbs_script_header def do_submit(self, job): @@ -53,7 +60,9 @@ def do_submit(self, job): script_file_dir = self.context.remote_root # stdin, stdout, stderr = self.context.block_checkcall('cd %s && %s %s' % (self.context.remote_root, 'qsub', script_file_name)) stdin, stdout, stderr = self.context.block_checkcall( - "cd {} && {} {}".format(shlex.quote(script_file_dir), "qsub", shlex.quote(script_file_name)) + "cd {} && {} {}".format( + shlex.quote(script_file_dir), "qsub", shlex.quote(script_file_name) + ) ) subret = stdout.readlines() job_id = subret[0].split()[0] @@ -77,7 +86,8 @@ def check_status(self, job): return JobStatus.terminated else: raise RuntimeError( - "status command qstat fails to execute. erro info: %s return code %d" % (err_str, ret) + "status command qstat fails to execute. 
erro info: %s return code %d" + % (err_str, ret) ) status_line = stdout.read().decode("utf-8").split("\n")[-2] status_word = status_line.split()[-2] @@ -126,7 +136,8 @@ def check_status(self, job): return JobStatus.terminated else: raise RuntimeError( - "status command qstat fails to execute. erro info: %s return code %d" % (err_str, ret) + "status command qstat fails to execute. erro info: %s return code %d" + % (err_str, ret) ) status_line = stdout.read().decode("utf-8").split("\n")[-2] status_word = status_line.split()[-2] @@ -152,15 +163,22 @@ def gen_script_header(self, job): f"#PBS -l nodes={resources.number_node}:ppn={resources.cpu_per_node}" ) if resources.gpu_per_node != 0: - pbs_script_header_dict["select_node_line"] += f":gpus={resources.gpu_per_node}" + pbs_script_header_dict["select_node_line"] += ( + f":gpus={resources.gpu_per_node}" + ) pbs_script_header_dict["queue_name_line"] = f"#PBS -q {resources.queue_name}" - if resources["strategy"].get("customized_script_header_template_file") is not None: + if ( + resources["strategy"].get("customized_script_header_template_file") + is not None + ): pbs_script_header = customized_script_header_template( resources["strategy"]["customized_script_header_template_file"], resources, ) else: - pbs_script_header = pbs_script_header_template.format(**pbs_script_header_dict) + pbs_script_header = pbs_script_header_template.format( + **pbs_script_header_dict + ) return pbs_script_header @@ -200,15 +218,24 @@ def gen_script_header(self, job): pe_name = resources.kwargs.get("pe_name", "mpi") sge_script_header_dict = {} sge_script_header_dict["select_node_line"] = f"#$ -N {job_name}\n" - sge_script_header_dict["select_node_line"] += f"#$ -pe {pe_name} {resources.cpu_per_node}\n" + sge_script_header_dict["select_node_line"] += ( + f"#$ -pe {pe_name} {resources.cpu_per_node}\n" + ) if resources.queue_name != "": - sge_script_header_dict["select_node_line"] += f"#$ -q {resources.queue_name}" - if resources["strategy"].get("customized_script_header_template_file") is not None: + sge_script_header_dict["select_node_line"] += ( + f"#$ -q {resources.queue_name}" + ) + if ( + resources["strategy"].get("customized_script_header_template_file") + is not None + ): file_name = resources["strategy"]["customized_script_header_template_file"] sge_script_header = customized_script_header_template(file_name, resources) else: - sge_script_header = sge_script_header_template.format(**sge_script_header_dict) + sge_script_header = sge_script_header_template.format( + **sge_script_header_dict + ) return sge_script_header def do_submit(self, job): @@ -239,7 +266,9 @@ def check_status(self, job): ret, stdin, stdout, stderr = self.context.block_call("qstat") err_str = stderr.read().decode("utf-8") if ret != 0: - raise RuntimeError(f"status command qstat fails to execute. erro info: {err_str} return code {ret}") + raise RuntimeError( + f"status command qstat fails to execute. erro info: {err_str} return code {ret}" + ) status_text_list = stdout.read().decode("utf-8").split("\n") for txt in status_text_list: if job_id in txt: @@ -250,7 +279,9 @@ def check_status(self, job): while count <= 6: if self.check_finish_tag(job=job): return JobStatus.finished - dlog.info(f"not tag_finished detected, execute sync command and wait. count {count}") + dlog.info( + f"not tag_finished detected, execute sync command and wait. 
count {count}" + ) self.context.block_call("sync") import time From c13be3b49d62e9aacc086b7cbf809db640491e7b Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Sun, 28 Jul 2024 02:08:29 +0900 Subject: [PATCH 36/76] Update SSHcontext: add execute command --- dpdispatcher/contexts/ssh_context.py | 135 +++++++++------------------ 1 file changed, 44 insertions(+), 91 deletions(-) diff --git a/dpdispatcher/contexts/ssh_context.py b/dpdispatcher/contexts/ssh_context.py index aed7b299..eea0547e 100644 --- a/dpdispatcher/contexts/ssh_context.py +++ b/dpdispatcher/contexts/ssh_context.py @@ -44,6 +44,7 @@ def __init__( totp_secret=None, tar_compress=True, look_for_keys=True, + execute_command=None, ): self.hostname = hostname self.username = username @@ -56,6 +57,7 @@ def __init__( self.ssh = None self.tar_compress = tar_compress self.look_for_keys = look_for_keys + self.execute_command = execute_command self._keyboard_interactive_auth = False self._setup_ssh() @@ -87,10 +89,7 @@ def ensure_alive(self, max_check=10, sleep_time=10): count = 1 while not self._check_alive(): if count == max_check: - raise RuntimeError( - "cannot connect ssh after %d failures at interval %d s" - % (max_check, sleep_time) - ) + raise RuntimeError("cannot connect ssh after %d failures at interval %d s" % (max_check, sleep_time)) dlog.info("connection check failed, try to reconnect to " + self.hostname) self._setup_ssh() count += 1 @@ -168,9 +167,7 @@ def _setup_ssh(self): ): try: # passing empty passphrase would not raise error. - key = pkey_class.from_private_key_file( - key_path, self.passphrase - ) + key = pkey_class.from_private_key_file(key_path, self.passphrase) except paramiko.SSHException as e: pass if key is not None: @@ -185,9 +182,7 @@ def _setup_ssh(self): (paramiko.Ed25519Key, "ed25519"), ]: for directory in [".ssh", "ssh"]: - full_path = os.path.join( - os.path.expanduser("~"), directory, f"id_{name}" - ) + full_path = os.path.join(os.path.expanduser("~"), directory, f"id_{name}") if os.path.isfile(full_path): keyfiles.append((keytype, full_path)) # TODO: supporting cert @@ -220,9 +215,7 @@ def _setup_ssh(self): elif self.password is not None: ts.auth_password(self.username, self.password) elif key_error is not None: - raise RuntimeError( - "Authentication failed, try to provide password" - ) from key_error + raise RuntimeError("Authentication failed, try to provide password") from key_error else: raise RuntimeError("Please provide at least one form of authentication") assert ts.is_active() @@ -237,6 +230,8 @@ def _setup_ssh(self): self.ssh._transport = ts # type: ignore # reset sftp self._sftp = None + if self.execute_command is not None: + self.exec_command(self.execute_command) def inter_handler(self, title, instructions, prompt_list): """inter_handler: the callback for paramiko.transport.auth_interactive. @@ -269,11 +264,7 @@ def inter_handler(self, title, instructions, prompt_list): resp.append(self.username) elif "password" in pr_str: resp.append(self.password) - elif ( - "verification" in pr_str - or "token" in pr_str - and self.totp_secret is not None - ): + elif "verification" in pr_str or "token" in pr_str and self.totp_secret is not None: assert self.totp_secret is not None resp.append(generate_totp(self.totp_secret)) @@ -325,19 +316,18 @@ def arginfo(): ) doc_port = "ssh connection port." doc_key_filename = ( - "key filename used by ssh connection. If left None, find key in ~/.ssh or " - "use password for login" + "key filename used by ssh connection. 
If left None, find key in ~/.ssh or " "use password for login" ) doc_passphrase = "passphrase of key used by ssh connection" doc_timeout = "timeout of ssh connection" doc_totp_secret = ( - "Time-based one time password secret. It should be a base32-encoded string" - " extracted from the 2D code." + "Time-based one time password secret. It should be a base32-encoded string" " extracted from the 2D code." ) - doc_tar_compress = "The archive will be compressed in upload and download if it is True. If not, compression will be skipped." - doc_look_for_keys = ( - "enable searching for discoverable private key files in ~/.ssh/" + doc_tar_compress = ( + "The archive will be compressed in upload and download if it is True. If not, compression will be skipped." ) + doc_look_for_keys = "enable searching for discoverable private key files in ~/.ssh/" + doc_execute_command = "execute command after ssh connection is established." ssh_remote_profile_args = [ Argument("hostname", str, optional=False, doc=doc_hostname), Argument("username", str, optional=False, doc=doc_username), @@ -379,10 +369,15 @@ def arginfo(): default=True, doc=doc_look_for_keys, ), + Argument( + "execute_command", + str, + optional=True, + default=None, + doc=doc_execute_command, + ), ] - ssh_remote_profile_format = Argument( - "ssh_session", dict, ssh_remote_profile_args - ) + ssh_remote_profile_format = Argument("ssh_session", dict, ssh_remote_profile_args) return ssh_remote_profile_format def put(self, from_f, to_f): @@ -510,9 +505,7 @@ def bind_submission(self, submission): assert self.ssh_session is not None assert self.ssh_session.ssh is not None self.submission = submission - self.local_root = pathlib.PurePath( - os.path.join(self.temp_local_root, submission.work_base) - ).as_posix() + self.local_root = pathlib.PurePath(os.path.join(self.temp_local_root, submission.work_base)).as_posix() old_remote_root = self.remote_root # self.remote_root = os.path.join(self.temp_remote_root, self.submission.submission_hash, self.submission.work_base ) self.remote_root = pathlib.PurePath( @@ -525,9 +518,7 @@ def bind_submission(self, submission): and self.check_file_exists(old_remote_root) and not self.check_file_exists(self.remote_root) ): - self.block_checkcall( - f"mv {shlex.quote(old_remote_root)} {shlex.quote(self.remote_root)}" - ) + self.block_checkcall(f"mv {shlex.quote(old_remote_root)} {shlex.quote(self.remote_root)}") elif ( old_remote_root is not None and old_remote_root != self.remote_root @@ -562,9 +553,7 @@ def _walk_directory(self, files, work_path, file_list, directory_list): if os.path.isfile(file_name): file_list.append(file_name) elif os.path.isdir(file_name): - for root, dirs, files in os.walk( - file_name, topdown=False, followlinks=True - ): + for root, dirs, files in os.walk(file_name, topdown=False, followlinks=True): if not files: directory_list.append(root) for name in files: @@ -574,12 +563,8 @@ def _walk_directory(self, files, work_path, file_list, directory_list): elif glob(file_name): # If the file name contains a wildcard, os.path functions will fail to identify it. Use glob to get the complete list of filenames which match the wildcard. 
abs_file_list = glob(file_name) - rel_file_list = [ - os.path.relpath(ii, start=work_path) for ii in abs_file_list - ] - self._walk_directory( - rel_file_list, work_path, file_list, directory_list - ) + rel_file_list = [os.path.relpath(ii, start=work_path) for ii in abs_file_list] + self._walk_directory(rel_file_list, work_path, file_list, directory_list) else: raise FileNotFoundError(f"cannot find upload file {work_path} {jj}") @@ -631,9 +616,7 @@ def upload( sha256_list = [] for jj in file_list: sha256 = get_sha256(jj) - jj_rel = pathlib.PurePath( - os.path.relpath(jj, self.local_root) - ).as_posix() + jj_rel = pathlib.PurePath(os.path.relpath(jj, self.local_root)).as_posix() sha256_list.append(f"{sha256} {jj_rel}") # write to remote sha256_file = pathlib.PurePath( @@ -665,9 +648,7 @@ def upload( def list_remote_dir(self, sftp, remote_dir, ref_remote_root, result_list): for entry in sftp.listdir_attr(remote_dir): - remote_name = pathlib.PurePath( - os.path.join(remote_dir, entry.filename) - ).as_posix() + remote_name = pathlib.PurePath(os.path.join(remote_dir, entry.filename)).as_posix() st_mode = entry.st_mode if S_ISDIR(st_mode): self.list_remote_dir(sftp, remote_name, ref_remote_root, result_list) @@ -696,23 +677,16 @@ def download( abs_file_list = fnmatch.filter(remote_file_list, jj) else: remote_file_list = [] - remote_job = pathlib.PurePath( - os.path.join(self.remote_root, ii.task_work_path) - ).as_posix() - self.list_remote_dir( - self.sftp, remote_job, remote_job, remote_file_list - ) + remote_job = pathlib.PurePath(os.path.join(self.remote_root, ii.task_work_path)).as_posix() + self.list_remote_dir(self.sftp, remote_job, remote_job, remote_file_list) abs_file_list = fnmatch.filter(remote_file_list, jj) rel_file_list = [ - pathlib.PurePath(os.path.join(ii.task_work_path, kk)).as_posix() - for kk in abs_file_list + pathlib.PurePath(os.path.join(ii.task_work_path, kk)).as_posix() for kk in abs_file_list ] else: - rel_file_list = [ - pathlib.PurePath(os.path.join(ii.task_work_path, jj)).as_posix() - ] + rel_file_list = [pathlib.PurePath(os.path.join(ii.task_work_path, jj)).as_posix()] if check_exists: for file_name in rel_file_list: if self.check_file_exists(file_name): @@ -736,17 +710,10 @@ def download( abs_errors = fnmatch.filter(remote_file_list, "error*") else: remote_file_list = [] - remote_job = pathlib.PurePath( - os.path.join(self.remote_root, ii.task_work_path) - ).as_posix() - self.list_remote_dir( - self.sftp, remote_job, remote_job, remote_file_list - ) + remote_job = pathlib.PurePath(os.path.join(self.remote_root, ii.task_work_path)).as_posix() + self.list_remote_dir(self.sftp, remote_job, remote_job, remote_file_list) abs_errors = fnmatch.filter(remote_file_list, "error*") - rel_errors = [ - pathlib.PurePath(os.path.join(ii.task_work_path, kk)).as_posix() - for kk in abs_errors - ] + rel_errors = [pathlib.PurePath(os.path.join(ii.task_work_path, kk)).as_posix() for kk in abs_errors] file_list.extend(rel_errors) file_list.extend(submission.backward_common_files) if len(file_list) > 0: @@ -774,9 +741,7 @@ def block_checkcall(self, cmd, asynchronously=False, stderr_whitelist=None): self.ssh_session.ensure_alive() if asynchronously: cmd = f"nohup {cmd} >/dev/null &" - stdin, stdout, stderr = self.ssh_session.exec_command( - (f"cd {shlex.quote(self.remote_root)} ;") + cmd - ) + stdin, stdout, stderr = self.ssh_session.exec_command((f"cd {shlex.quote(self.remote_root)} ;") + cmd) exit_status = stdout.channel.recv_exit_status() if exit_status != 0: raise RuntimeError( @@ 
-793,9 +758,7 @@ def block_checkcall(self, cmd, asynchronously=False, stderr_whitelist=None): def block_call(self, cmd): assert self.remote_root is not None self.ssh_session.ensure_alive() - stdin, stdout, stderr = self.ssh_session.exec_command( - (f"cd {shlex.quote(self.remote_root)} ;") + cmd - ) + stdin, stdout, stderr = self.ssh_session.exec_command((f"cd {shlex.quote(self.remote_root)} ;") + cmd) exit_status = stdout.channel.recv_exit_status() return exit_status, stdin, stdout, stderr @@ -834,9 +797,7 @@ def check_file_exists(self, fname): assert self.remote_root is not None self.ssh_session.ensure_alive() try: - self.sftp.stat( - pathlib.PurePath(os.path.join(self.remote_root, fname)).as_posix() - ) + self.sftp.stat(pathlib.PurePath(os.path.join(self.remote_root, fname)).as_posix()) ret = True except OSError: ret = False @@ -969,22 +930,16 @@ def _get_files(self, files, tar_compress=True): file_list = " ".join([shlex.quote(file) for file in files]) tar_cmd = f"tar {tar_command} {shlex.quote(of)} {file_list}" else: - file_list_file = pathlib.PurePath( - os.path.join(self.remote_root, f".tmp_tar_{uuid.uuid4()}") - ).as_posix() + file_list_file = pathlib.PurePath(os.path.join(self.remote_root, f".tmp_tar_{uuid.uuid4()}")).as_posix() self.write_file(file_list_file, "\n".join(files)) - tar_cmd = ( - f"tar {tar_command} {shlex.quote(of)} -T {shlex.quote(file_list_file)}" - ) + tar_cmd = f"tar {tar_command} {shlex.quote(of)} -T {shlex.quote(file_list_file)}" # Execute the tar command remotely try: self.block_checkcall(tar_cmd) except RuntimeError as e: if "No such file or directory" in str(e): - raise FileNotFoundError( - "Backward files do not exist in the remote directory." - ) from e + raise FileNotFoundError("Backward files do not exist in the remote directory.") from e raise e # Transfer the archive from remote to local @@ -1009,9 +964,7 @@ def machine_subfields(cls) -> List[Argument]: list[Argument] machine subfields """ - doc_remote_profile = ( - "The information used to maintain the connection with remote machine." - ) + doc_remote_profile = "The information used to maintain the connection with remote machine." remote_profile_format = SSHSession.arginfo() remote_profile_format.name = "remote_profile" remote_profile_format.doc = doc_remote_profile From a9fbadfe73ce2592d90f8b70942cddb9f700aa71 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 27 Jul 2024 19:08:57 +0000 Subject: [PATCH 37/76] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dpdispatcher/contexts/ssh_context.py | 123 ++++++++++++++++++++------- 1 file changed, 91 insertions(+), 32 deletions(-) diff --git a/dpdispatcher/contexts/ssh_context.py b/dpdispatcher/contexts/ssh_context.py index eea0547e..6e80c033 100644 --- a/dpdispatcher/contexts/ssh_context.py +++ b/dpdispatcher/contexts/ssh_context.py @@ -89,7 +89,10 @@ def ensure_alive(self, max_check=10, sleep_time=10): count = 1 while not self._check_alive(): if count == max_check: - raise RuntimeError("cannot connect ssh after %d failures at interval %d s" % (max_check, sleep_time)) + raise RuntimeError( + "cannot connect ssh after %d failures at interval %d s" + % (max_check, sleep_time) + ) dlog.info("connection check failed, try to reconnect to " + self.hostname) self._setup_ssh() count += 1 @@ -167,7 +170,9 @@ def _setup_ssh(self): ): try: # passing empty passphrase would not raise error. 
- key = pkey_class.from_private_key_file(key_path, self.passphrase) + key = pkey_class.from_private_key_file( + key_path, self.passphrase + ) except paramiko.SSHException as e: pass if key is not None: @@ -182,7 +187,9 @@ def _setup_ssh(self): (paramiko.Ed25519Key, "ed25519"), ]: for directory in [".ssh", "ssh"]: - full_path = os.path.join(os.path.expanduser("~"), directory, f"id_{name}") + full_path = os.path.join( + os.path.expanduser("~"), directory, f"id_{name}" + ) if os.path.isfile(full_path): keyfiles.append((keytype, full_path)) # TODO: supporting cert @@ -215,7 +222,9 @@ def _setup_ssh(self): elif self.password is not None: ts.auth_password(self.username, self.password) elif key_error is not None: - raise RuntimeError("Authentication failed, try to provide password") from key_error + raise RuntimeError( + "Authentication failed, try to provide password" + ) from key_error else: raise RuntimeError("Please provide at least one form of authentication") assert ts.is_active() @@ -264,7 +273,11 @@ def inter_handler(self, title, instructions, prompt_list): resp.append(self.username) elif "password" in pr_str: resp.append(self.password) - elif "verification" in pr_str or "token" in pr_str and self.totp_secret is not None: + elif ( + "verification" in pr_str + or "token" in pr_str + and self.totp_secret is not None + ): assert self.totp_secret is not None resp.append(generate_totp(self.totp_secret)) @@ -316,17 +329,19 @@ def arginfo(): ) doc_port = "ssh connection port." doc_key_filename = ( - "key filename used by ssh connection. If left None, find key in ~/.ssh or " "use password for login" + "key filename used by ssh connection. If left None, find key in ~/.ssh or " + "use password for login" ) doc_passphrase = "passphrase of key used by ssh connection" doc_timeout = "timeout of ssh connection" doc_totp_secret = ( - "Time-based one time password secret. It should be a base32-encoded string" " extracted from the 2D code." + "Time-based one time password secret. It should be a base32-encoded string" + " extracted from the 2D code." ) - doc_tar_compress = ( - "The archive will be compressed in upload and download if it is True. If not, compression will be skipped." + doc_tar_compress = "The archive will be compressed in upload and download if it is True. If not, compression will be skipped." + doc_look_for_keys = ( + "enable searching for discoverable private key files in ~/.ssh/" ) - doc_look_for_keys = "enable searching for discoverable private key files in ~/.ssh/" doc_execute_command = "execute command after ssh connection is established." 
ssh_remote_profile_args = [ Argument("hostname", str, optional=False, doc=doc_hostname), @@ -377,7 +392,9 @@ def arginfo(): doc=doc_execute_command, ), ] - ssh_remote_profile_format = Argument("ssh_session", dict, ssh_remote_profile_args) + ssh_remote_profile_format = Argument( + "ssh_session", dict, ssh_remote_profile_args + ) return ssh_remote_profile_format def put(self, from_f, to_f): @@ -505,7 +522,9 @@ def bind_submission(self, submission): assert self.ssh_session is not None assert self.ssh_session.ssh is not None self.submission = submission - self.local_root = pathlib.PurePath(os.path.join(self.temp_local_root, submission.work_base)).as_posix() + self.local_root = pathlib.PurePath( + os.path.join(self.temp_local_root, submission.work_base) + ).as_posix() old_remote_root = self.remote_root # self.remote_root = os.path.join(self.temp_remote_root, self.submission.submission_hash, self.submission.work_base ) self.remote_root = pathlib.PurePath( @@ -518,7 +537,9 @@ def bind_submission(self, submission): and self.check_file_exists(old_remote_root) and not self.check_file_exists(self.remote_root) ): - self.block_checkcall(f"mv {shlex.quote(old_remote_root)} {shlex.quote(self.remote_root)}") + self.block_checkcall( + f"mv {shlex.quote(old_remote_root)} {shlex.quote(self.remote_root)}" + ) elif ( old_remote_root is not None and old_remote_root != self.remote_root @@ -553,7 +574,9 @@ def _walk_directory(self, files, work_path, file_list, directory_list): if os.path.isfile(file_name): file_list.append(file_name) elif os.path.isdir(file_name): - for root, dirs, files in os.walk(file_name, topdown=False, followlinks=True): + for root, dirs, files in os.walk( + file_name, topdown=False, followlinks=True + ): if not files: directory_list.append(root) for name in files: @@ -563,8 +586,12 @@ def _walk_directory(self, files, work_path, file_list, directory_list): elif glob(file_name): # If the file name contains a wildcard, os.path functions will fail to identify it. Use glob to get the complete list of filenames which match the wildcard. 
abs_file_list = glob(file_name) - rel_file_list = [os.path.relpath(ii, start=work_path) for ii in abs_file_list] - self._walk_directory(rel_file_list, work_path, file_list, directory_list) + rel_file_list = [ + os.path.relpath(ii, start=work_path) for ii in abs_file_list + ] + self._walk_directory( + rel_file_list, work_path, file_list, directory_list + ) else: raise FileNotFoundError(f"cannot find upload file {work_path} {jj}") @@ -616,7 +643,9 @@ def upload( sha256_list = [] for jj in file_list: sha256 = get_sha256(jj) - jj_rel = pathlib.PurePath(os.path.relpath(jj, self.local_root)).as_posix() + jj_rel = pathlib.PurePath( + os.path.relpath(jj, self.local_root) + ).as_posix() sha256_list.append(f"{sha256} {jj_rel}") # write to remote sha256_file = pathlib.PurePath( @@ -648,7 +677,9 @@ def upload( def list_remote_dir(self, sftp, remote_dir, ref_remote_root, result_list): for entry in sftp.listdir_attr(remote_dir): - remote_name = pathlib.PurePath(os.path.join(remote_dir, entry.filename)).as_posix() + remote_name = pathlib.PurePath( + os.path.join(remote_dir, entry.filename) + ).as_posix() st_mode = entry.st_mode if S_ISDIR(st_mode): self.list_remote_dir(sftp, remote_name, ref_remote_root, result_list) @@ -677,16 +708,23 @@ def download( abs_file_list = fnmatch.filter(remote_file_list, jj) else: remote_file_list = [] - remote_job = pathlib.PurePath(os.path.join(self.remote_root, ii.task_work_path)).as_posix() - self.list_remote_dir(self.sftp, remote_job, remote_job, remote_file_list) + remote_job = pathlib.PurePath( + os.path.join(self.remote_root, ii.task_work_path) + ).as_posix() + self.list_remote_dir( + self.sftp, remote_job, remote_job, remote_file_list + ) abs_file_list = fnmatch.filter(remote_file_list, jj) rel_file_list = [ - pathlib.PurePath(os.path.join(ii.task_work_path, kk)).as_posix() for kk in abs_file_list + pathlib.PurePath(os.path.join(ii.task_work_path, kk)).as_posix() + for kk in abs_file_list ] else: - rel_file_list = [pathlib.PurePath(os.path.join(ii.task_work_path, jj)).as_posix()] + rel_file_list = [ + pathlib.PurePath(os.path.join(ii.task_work_path, jj)).as_posix() + ] if check_exists: for file_name in rel_file_list: if self.check_file_exists(file_name): @@ -710,10 +748,17 @@ def download( abs_errors = fnmatch.filter(remote_file_list, "error*") else: remote_file_list = [] - remote_job = pathlib.PurePath(os.path.join(self.remote_root, ii.task_work_path)).as_posix() - self.list_remote_dir(self.sftp, remote_job, remote_job, remote_file_list) + remote_job = pathlib.PurePath( + os.path.join(self.remote_root, ii.task_work_path) + ).as_posix() + self.list_remote_dir( + self.sftp, remote_job, remote_job, remote_file_list + ) abs_errors = fnmatch.filter(remote_file_list, "error*") - rel_errors = [pathlib.PurePath(os.path.join(ii.task_work_path, kk)).as_posix() for kk in abs_errors] + rel_errors = [ + pathlib.PurePath(os.path.join(ii.task_work_path, kk)).as_posix() + for kk in abs_errors + ] file_list.extend(rel_errors) file_list.extend(submission.backward_common_files) if len(file_list) > 0: @@ -741,7 +786,9 @@ def block_checkcall(self, cmd, asynchronously=False, stderr_whitelist=None): self.ssh_session.ensure_alive() if asynchronously: cmd = f"nohup {cmd} >/dev/null &" - stdin, stdout, stderr = self.ssh_session.exec_command((f"cd {shlex.quote(self.remote_root)} ;") + cmd) + stdin, stdout, stderr = self.ssh_session.exec_command( + (f"cd {shlex.quote(self.remote_root)} ;") + cmd + ) exit_status = stdout.channel.recv_exit_status() if exit_status != 0: raise RuntimeError( @@ 
-758,7 +805,9 @@ def block_checkcall(self, cmd, asynchronously=False, stderr_whitelist=None): def block_call(self, cmd): assert self.remote_root is not None self.ssh_session.ensure_alive() - stdin, stdout, stderr = self.ssh_session.exec_command((f"cd {shlex.quote(self.remote_root)} ;") + cmd) + stdin, stdout, stderr = self.ssh_session.exec_command( + (f"cd {shlex.quote(self.remote_root)} ;") + cmd + ) exit_status = stdout.channel.recv_exit_status() return exit_status, stdin, stdout, stderr @@ -797,7 +846,9 @@ def check_file_exists(self, fname): assert self.remote_root is not None self.ssh_session.ensure_alive() try: - self.sftp.stat(pathlib.PurePath(os.path.join(self.remote_root, fname)).as_posix()) + self.sftp.stat( + pathlib.PurePath(os.path.join(self.remote_root, fname)).as_posix() + ) ret = True except OSError: ret = False @@ -930,16 +981,22 @@ def _get_files(self, files, tar_compress=True): file_list = " ".join([shlex.quote(file) for file in files]) tar_cmd = f"tar {tar_command} {shlex.quote(of)} {file_list}" else: - file_list_file = pathlib.PurePath(os.path.join(self.remote_root, f".tmp_tar_{uuid.uuid4()}")).as_posix() + file_list_file = pathlib.PurePath( + os.path.join(self.remote_root, f".tmp_tar_{uuid.uuid4()}") + ).as_posix() self.write_file(file_list_file, "\n".join(files)) - tar_cmd = f"tar {tar_command} {shlex.quote(of)} -T {shlex.quote(file_list_file)}" + tar_cmd = ( + f"tar {tar_command} {shlex.quote(of)} -T {shlex.quote(file_list_file)}" + ) # Execute the tar command remotely try: self.block_checkcall(tar_cmd) except RuntimeError as e: if "No such file or directory" in str(e): - raise FileNotFoundError("Backward files do not exist in the remote directory.") from e + raise FileNotFoundError( + "Backward files do not exist in the remote directory." + ) from e raise e # Transfer the archive from remote to local @@ -964,7 +1021,9 @@ def machine_subfields(cls) -> List[Argument]: list[Argument] machine subfields """ - doc_remote_profile = "The information used to maintain the connection with remote machine." + doc_remote_profile = ( + "The information used to maintain the connection with remote machine." 
+ ) remote_profile_format = SSHSession.arginfo() remote_profile_format.name = "remote_profile" remote_profile_format.doc = doc_remote_profile From fba957479b6effce0bf0b69af1005146e4ee04ab Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Mon, 29 Jul 2024 09:10:24 +0900 Subject: [PATCH 38/76] Update pbs.py --- dpdispatcher/machines/pbs.py | 68 +++++++++++------------------------- 1 file changed, 20 insertions(+), 48 deletions(-) diff --git a/dpdispatcher/machines/pbs.py b/dpdispatcher/machines/pbs.py index e0fbe97e..8699f39b 100644 --- a/dpdispatcher/machines/pbs.py +++ b/dpdispatcher/machines/pbs.py @@ -28,22 +28,15 @@ def gen_script_header(self, job): f"#PBS -l select={resources.number_node}:ncpus={resources.cpu_per_node}" ) if resources.gpu_per_node != 0: - pbs_script_header_dict["select_node_line"] += ( - f":ngpus={resources.gpu_per_node}" - ) + pbs_script_header_dict["select_node_line"] += f":ngpus={resources.gpu_per_node}" pbs_script_header_dict["queue_name_line"] = f"#PBS -q {resources.queue_name}" - if ( - resources["strategy"].get("customized_script_header_template_file") - is not None - ): + if resources["strategy"].get("customized_script_header_template_file") is not None: pbs_script_header = customized_script_header_template( resources["strategy"]["customized_script_header_template_file"], resources, ) else: - pbs_script_header = pbs_script_header_template.format( - **pbs_script_header_dict - ) + pbs_script_header = pbs_script_header_template.format(**pbs_script_header_dict) return pbs_script_header def do_submit(self, job): @@ -60,9 +53,7 @@ def do_submit(self, job): script_file_dir = self.context.remote_root # stdin, stdout, stderr = self.context.block_checkcall('cd %s && %s %s' % (self.context.remote_root, 'qsub', script_file_name)) stdin, stdout, stderr = self.context.block_checkcall( - "cd {} && {} {}".format( - shlex.quote(script_file_dir), "qsub", shlex.quote(script_file_name) - ) + "cd {} && {} {}".format(shlex.quote(script_file_dir), "qsub", shlex.quote(script_file_name)) ) subret = stdout.readlines() job_id = subret[0].split()[0] @@ -86,8 +77,7 @@ def check_status(self, job): return JobStatus.terminated else: raise RuntimeError( - "status command qstat fails to execute. erro info: %s return code %d" - % (err_str, ret) + "status command qstat fails to execute. erro info: %s return code %d" % (err_str, ret) ) status_line = stdout.read().decode("utf-8").split("\n")[-2] status_word = status_line.split()[-2] @@ -136,8 +126,7 @@ def check_status(self, job): return JobStatus.terminated else: raise RuntimeError( - "status command qstat fails to execute. erro info: %s return code %d" - % (err_str, ret) + "status command qstat fails to execute. 
erro info: %s return code %d" % (err_str, ret) ) status_line = stdout.read().decode("utf-8").split("\n")[-2] status_word = status_line.split()[-2] @@ -163,22 +152,15 @@ def gen_script_header(self, job): f"#PBS -l nodes={resources.number_node}:ppn={resources.cpu_per_node}" ) if resources.gpu_per_node != 0: - pbs_script_header_dict["select_node_line"] += ( - f":gpus={resources.gpu_per_node}" - ) + pbs_script_header_dict["select_node_line"] += f":gpus={resources.gpu_per_node}" pbs_script_header_dict["queue_name_line"] = f"#PBS -q {resources.queue_name}" - if ( - resources["strategy"].get("customized_script_header_template_file") - is not None - ): + if resources["strategy"].get("customized_script_header_template_file") is not None: pbs_script_header = customized_script_header_template( resources["strategy"]["customized_script_header_template_file"], resources, ) else: - pbs_script_header = pbs_script_header_template.format( - **pbs_script_header_dict - ) + pbs_script_header = pbs_script_header_template.format(**pbs_script_header_dict) return pbs_script_header @@ -218,24 +200,15 @@ def gen_script_header(self, job): pe_name = resources.kwargs.get("pe_name", "mpi") sge_script_header_dict = {} sge_script_header_dict["select_node_line"] = f"#$ -N {job_name}\n" - sge_script_header_dict["select_node_line"] += ( - f"#$ -pe {pe_name} {resources.cpu_per_node}\n" - ) + sge_script_header_dict["select_node_line"] += f"#$ -pe {pe_name} {resources.cpu_per_node}\n" if resources.queue_name != "": - sge_script_header_dict["select_node_line"] += ( - f"#$ -q {resources.queue_name}" - ) - if ( - resources["strategy"].get("customized_script_header_template_file") - is not None - ): + sge_script_header_dict["select_node_line"] += f"#$ -q {resources.queue_name}" + if resources["strategy"].get("customized_script_header_template_file") is not None: file_name = resources["strategy"]["customized_script_header_template_file"] sge_script_header = customized_script_header_template(file_name, resources) else: - sge_script_header = sge_script_header_template.format( - **sge_script_header_dict - ) + sge_script_header = sge_script_header_template.format(**sge_script_header_dict) return sge_script_header def do_submit(self, job): @@ -259,6 +232,7 @@ def default_resources(self, resources): pass def check_status(self, job): + ### https://softpanorama.org/HPC/Grid_engine/Queues/queue_states.shtml job_id = job.job_id status_line = None if job_id == "": @@ -266,9 +240,7 @@ def check_status(self, job): ret, stdin, stdout, stderr = self.context.block_call("qstat") err_str = stderr.read().decode("utf-8") if ret != 0: - raise RuntimeError( - f"status command qstat fails to execute. erro info: {err_str} return code {ret}" - ) + raise RuntimeError(f"status command qstat fails to execute. erro info: {err_str} return code {ret}") status_text_list = stdout.read().decode("utf-8").split("\n") for txt in status_text_list: if job_id in txt: @@ -279,9 +251,7 @@ def check_status(self, job): while count <= 6: if self.check_finish_tag(job=job): return JobStatus.finished - dlog.info( - f"not tag_finished detected, execute sync command and wait. count {count}" - ) + dlog.info(f"not tag_finished detected, execute sync command and wait. 
count {count}") self.context.block_call("sync") import time @@ -291,10 +261,12 @@ def check_status(self, job): else: status_word = status_line.split()[4] # dlog.info (status_word) - if status_word in ["qw"]: + if status_word in ["qw", "hqw", "t"]: return JobStatus.waiting - elif status_word in ["r"]: + elif status_word in ["r", "Rr"]: return JobStatus.running + elif status_word in ["Eqw", "dr", "dt"]: + return JobStatus.terminated else: return JobStatus.unknown From 5f42bc84c8f9a69d4881dc3beeaa6fa671995563 Mon Sep 17 00:00:00 2001 From: "C. Thang Nguyen" Date: Mon, 2 Sep 2024 17:57:29 +0900 Subject: [PATCH 39/76] Update dlog.py --- dpdispatcher/dlog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpdispatcher/dlog.py b/dpdispatcher/dlog.py index c92babcc..75c0f2d1 100644 --- a/dpdispatcher/dlog.py +++ b/dpdispatcher/dlog.py @@ -6,7 +6,7 @@ dlog = logging.getLogger("dpdispatcher") dlog.propagate = False dlog.setLevel(logging.INFO) -cwd_logfile_path = os.path.join(os.getcwd(), "dpdispatcher.log") +cwd_logfile_path = os.path.join(os.getcwd(), "logs/dpdispatcher.log") dlogf = logging.FileHandler(cwd_logfile_path, delay=True) try: dlog.addHandler(dlogf) From bfe1615a1d2c707b36b815f3a3fcba1f38ea63e6 Mon Sep 17 00:00:00 2001 From: "C. Thang Nguyen" Date: Tue, 3 Sep 2024 03:11:58 +0900 Subject: [PATCH 40/76] Update dlog.py --- dpdispatcher/dlog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpdispatcher/dlog.py b/dpdispatcher/dlog.py index 75c0f2d1..c92babcc 100644 --- a/dpdispatcher/dlog.py +++ b/dpdispatcher/dlog.py @@ -6,7 +6,7 @@ dlog = logging.getLogger("dpdispatcher") dlog.propagate = False dlog.setLevel(logging.INFO) -cwd_logfile_path = os.path.join(os.getcwd(), "logs/dpdispatcher.log") +cwd_logfile_path = os.path.join(os.getcwd(), "dpdispatcher.log") dlogf = logging.FileHandler(cwd_logfile_path, delay=True) try: dlog.addHandler(dlogf) From 1f514de64b13925f0ec3ee16242e32997df26e0f Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Sun, 6 Oct 2024 21:49:43 +0900 Subject: [PATCH 41/76] Update ssh_context.py --- dpdispatcher/contexts/ssh_context.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dpdispatcher/contexts/ssh_context.py b/dpdispatcher/contexts/ssh_context.py index 8537894d..8586c2fa 100644 --- a/dpdispatcher/contexts/ssh_context.py +++ b/dpdispatcher/contexts/ssh_context.py @@ -450,7 +450,9 @@ def __init__( self.init_local_root = local_root self.init_remote_root = remote_root self.temp_local_root = os.path.abspath(local_root) - assert os.path.isabs(remote_root), "remote_root must be a abspath" + assert os.path.isabs( + os.path.expanduser(remote_root) + ), "remote_root must be a abspath" self.temp_remote_root = remote_root self.remote_profile = remote_profile self.remote_root = None From be99e41de48ba6bc2961005447990c56a3f369b2 Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Sun, 6 Oct 2024 21:56:56 +0900 Subject: [PATCH 42/76] Update ssh_context.py --- dpdispatcher/contexts/ssh_context.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpdispatcher/contexts/ssh_context.py b/dpdispatcher/contexts/ssh_context.py index 8586c2fa..75314890 100644 --- a/dpdispatcher/contexts/ssh_context.py +++ b/dpdispatcher/contexts/ssh_context.py @@ -451,7 +451,7 @@ def __init__( self.init_remote_root = remote_root self.temp_local_root = os.path.abspath(local_root) assert os.path.isabs( - os.path.expanduser(remote_root) + 
os.path.realpath(remote_root) ), "remote_root must be a abspath" self.temp_remote_root = remote_root self.remote_profile = remote_profile From 415c114639cad5d2b610d75c2ea3b520b52d6823 Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Tue, 15 Oct 2024 11:07:51 +0900 Subject: [PATCH 43/76] allow retry_count=0 --- dpdispatcher/submission.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/dpdispatcher/submission.py b/dpdispatcher/submission.py index 28e38f4a..3c69b1be 100644 --- a/dpdispatcher/submission.py +++ b/dpdispatcher/submission.py @@ -844,8 +844,12 @@ def handle_unexpected_job_state(self): ) retry_count = 3 assert self.machine is not None - if hasattr(self.machine, "retry_count") and self.machine.retry_count >= 0: - retry_count = self.machine.retry_count + 1 + # if hasattr(self.machine, "retry_count") and self.machine.retry_count >= 0: + # retry_count = self.machine.retry_count + 1 + + if hasattr(self.machine, "retry_count"): + retry_count = self.machine.retry_count + if (self.fail_count) > 0 and (self.fail_count % retry_count == 0): last_error_message = self.get_last_error_message() err_msg = ( From ac9c1e4be05c6c3e17402f51c7a507fc1a5e346b Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Wed, 16 Oct 2024 01:58:56 +0900 Subject: [PATCH 44/76] Update submission.py --- dpdispatcher/submission.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/dpdispatcher/submission.py b/dpdispatcher/submission.py index 3c69b1be..01976b9f 100644 --- a/dpdispatcher/submission.py +++ b/dpdispatcher/submission.py @@ -844,9 +844,6 @@ def handle_unexpected_job_state(self): ) retry_count = 3 assert self.machine is not None - # if hasattr(self.machine, "retry_count") and self.machine.retry_count >= 0: - # retry_count = self.machine.retry_count + 1 - if hasattr(self.machine, "retry_count"): retry_count = self.machine.retry_count From 1f29674e26eeaf1ce87c313e9628e3904e740077 Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Wed, 23 Oct 2024 15:49:01 +0900 Subject: [PATCH 45/76] Update submission.py --- dpdispatcher/submission.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/dpdispatcher/submission.py b/dpdispatcher/submission.py index 01976b9f..57d7b3c1 100644 --- a/dpdispatcher/submission.py +++ b/dpdispatcher/submission.py @@ -847,15 +847,18 @@ def handle_unexpected_job_state(self): if hasattr(self.machine, "retry_count"): retry_count = self.machine.retry_count - if (self.fail_count) > 0 and (self.fail_count % retry_count == 0): - last_error_message = self.get_last_error_message() - err_msg = ( - f"job:{self.job_hash} {self.job_id} failed {self.fail_count} times." - ) - if last_error_message is not None: - err_msg += f"\nPossible remote error message: {last_error_message}" - raise RuntimeError(err_msg) - self.submit_job() + if (self.fail_count) > 0: + if self.fail_count % retry_count == 0: + last_error_message = self.get_last_error_message() + err_msg = f"job:{self.job_hash} {self.job_id} failed {self.fail_count} times." 
+ if last_error_message is not None: + err_msg += ( + f"\nPossible remote error message: {last_error_message}" + ) + raise RuntimeError(err_msg) + else: + self.submit_job() + if self.job_state != JobStatus.unsubmitted: dlog.info( f"job:{self.job_hash} re-submit after terminated; new job_id is {self.job_id}" From ba5147d4dd466e7f87412c8262d42a6b49a049ff Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Wed, 23 Oct 2024 16:13:42 +0900 Subject: [PATCH 46/76] Update submission.py --- dpdispatcher/submission.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/dpdispatcher/submission.py b/dpdispatcher/submission.py index 57d7b3c1..57ce2d0d 100644 --- a/dpdispatcher/submission.py +++ b/dpdispatcher/submission.py @@ -847,18 +847,16 @@ def handle_unexpected_job_state(self): if hasattr(self.machine, "retry_count"): retry_count = self.machine.retry_count - if (self.fail_count) > 0: - if self.fail_count % retry_count == 0: - last_error_message = self.get_last_error_message() - err_msg = f"job:{self.job_hash} {self.job_id} failed {self.fail_count} times." - if last_error_message is not None: - err_msg += ( - f"\nPossible remote error message: {last_error_message}" - ) - raise RuntimeError(err_msg) - else: - self.submit_job() + if (self.fail_count) > 0 and (self.fail_count % retry_count == 0): + last_error_message = self.get_last_error_message() + err_msg = ( + f"job:{self.job_hash} {self.job_id} failed {self.fail_count} times." + ) + if last_error_message is not None: + err_msg += f"\nPossible remote error message: {last_error_message}" + raise RuntimeError(err_msg) + self.submit_job() if self.job_state != JobStatus.unsubmitted: dlog.info( f"job:{self.job_hash} re-submit after terminated; new job_id is {self.job_id}" @@ -874,8 +872,10 @@ def handle_unexpected_job_state(self): if job_state == JobStatus.unsubmitted: dlog.debug(f"job: {self.job_hash} unsubmitted; submit it") - # if self.fail_count > 3: - # raise RuntimeError("job:job {job} failed 3 times".format(job=self)) + if self.fail_count > retry_count: + raise RuntimeError( + f"job:job {self.job_hash} failed {self.fail_count} times, exceed retry_count {retry_count}" + ) self.submit_job() if self.job_state != JobStatus.unsubmitted: dlog.info(f"job: {self.job_hash} submit; job_id is {self.job_id}") From 7ee3919adff0ae94b5422c89e1c85d33a8eaf831 Mon Sep 17 00:00:00 2001 From: Thang Nguyen <46436648+thangckt@users.noreply.github.com> Date: Wed, 23 Oct 2024 16:18:50 +0900 Subject: [PATCH 47/76] Update submission.py --- dpdispatcher/submission.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dpdispatcher/submission.py b/dpdispatcher/submission.py index 57ce2d0d..4994f6a2 100644 --- a/dpdispatcher/submission.py +++ b/dpdispatcher/submission.py @@ -842,12 +842,12 @@ def handle_unexpected_job_state(self): f"job: {self.job_hash} {self.job_id} terminated; " f"fail_cout is {self.fail_count}; resubmitting job" ) - retry_count = 3 + self.retry_count = 3 assert self.machine is not None if hasattr(self.machine, "retry_count"): - retry_count = self.machine.retry_count + self.retry_count = self.machine.retry_count - if (self.fail_count) > 0 and (self.fail_count % retry_count == 0): + if (self.fail_count) > 0 and (self.fail_count % self.retry_count == 0): last_error_message = self.get_last_error_message() err_msg = ( f"job:{self.job_hash} {self.job_id} failed {self.fail_count} times." 
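The submission.py revisions above keep reworking one guard, so a condensed sketch of the resubmission policy they circle around may help; next_action is an assumed standalone name, not dpdispatcher API, and the default budget of 3 is the only value taken from the hunks:

    # Hedged sketch of handle_unexpected_job_state's retry policy: a job seen
    # in the terminated state is resubmitted until its failure count exceeds
    # the per-machine retry budget, after which the last error is surfaced.
    def next_action(fail_count: int, retry_count: int = 3) -> str:
        if fail_count > retry_count:
            return "raise"  # give up; raise RuntimeError with the last error message
        return "resubmit"  # call submit_job() again

    assert [next_action(n) for n in range(1, 6)] == [
        "resubmit", "resubmit", "resubmit", "raise", "raise"
    ]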
From 2fb5e818ddfc81e189fcc8ced4c1432b8e5accdc Mon Sep 17 00:00:00 2001
From: Thang Nguyen <46436648+thangckt@users.noreply.github.com>
Date: Wed, 23 Oct 2024 16:32:35 +0900
Subject: [PATCH 48/76] Update submission.py

---
 dpdispatcher/submission.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/dpdispatcher/submission.py b/dpdispatcher/submission.py
index 4994f6a2..19be0144 100644
--- a/dpdispatcher/submission.py
+++ b/dpdispatcher/submission.py
@@ -842,20 +842,24 @@ def handle_unexpected_job_state(self):
                 f"job: {self.job_hash} {self.job_id} terminated; "
                 f"fail_cout is {self.fail_count}; resubmitting job"
             )
-            self.retry_count = 3
+            retry_count = 3
             assert self.machine is not None
             if hasattr(self.machine, "retry_count"):
-                self.retry_count = self.machine.retry_count
+                retry_count = self.machine.retry_count

-            if (self.fail_count) > 0 and (self.fail_count % self.retry_count == 0):
+            print(f"retry_count: {retry_count}")
+
+            if (self.fail_count) > 0 and (self.fail_count % retry_count == 0):
                 last_error_message = self.get_last_error_message()
                 err_msg = (
                     f"job:{self.job_hash} {self.job_id} failed {self.fail_count} times."
                 )
+
+                print(f"last_error_message: {last_error_message}")
+
                 if last_error_message is not None:
                     err_msg += f"\nPossible remote error message: {last_error_message}"
                 raise RuntimeError(err_msg)
@@ -872,10 +876,8 @@ def handle_unexpected_job_state(self):

         if job_state == JobStatus.unsubmitted:
             dlog.debug(f"job: {self.job_hash} unsubmitted; submit it")
-            if self.fail_count > self.retry_count:
-                raise RuntimeError(
-                    f"job:job {self.job_hash} failed {self.fail_count} times, exceed retry_count {self.retry_count}"
-                )
+            # if self.fail_count > 3:
+            #     raise RuntimeError("job:job {job} failed 3 times".format(job=self))
             self.submit_job()
             if self.job_state != JobStatus.unsubmitted:
                 dlog.info(f"job: {self.job_hash} submit; job_id is {self.job_id}")

From 033443cc2bffbe8c632cf0fe70cd576dccf2125b Mon Sep 17 00:00:00 2001
From: Thang Nguyen <46436648+thangckt@users.noreply.github.com>
Date: Wed, 23 Oct 2024 16:52:52 +0900
Subject: [PATCH 49/76] Update submission.py

---
 dpdispatcher/submission.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/dpdispatcher/submission.py b/dpdispatcher/submission.py
index 19be0144..a661c218 100644
--- a/dpdispatcher/submission.py
+++ b/dpdispatcher/submission.py
@@ -847,20 +847,17 @@ def handle_unexpected_job_state(self):
             if hasattr(self.machine, "retry_count"):
                 retry_count = self.machine.retry_count

-            print(f"retry_count: {retry_count}")
-
             if (self.fail_count) > 0 and (self.fail_count % retry_count == 0):
                 last_error_message = self.get_last_error_message()
                 err_msg = (
                     f"job:{self.job_hash} {self.job_id} failed {self.fail_count} times."
                 )
-
-            print(f"last_error_message: {last_error_message}")
-
                 if last_error_message is not None:
                     err_msg += f"\nPossible remote error message: {last_error_message}"
                 raise RuntimeError(err_msg)
-            self.submit_job()
+            else:
+                self.submit_job()
+
             if self.job_state != JobStatus.unsubmitted:
                 dlog.info(
                     f"job:{self.job_hash} re-submit after terminated; new job_id is {self.job_id}"

From f977a7a469c3811320320c51ba8d275a646d7a92 Mon Sep 17 00:00:00 2001
From: Thang Nguyen <46436648+thangckt@users.noreply.github.com>
Date: Wed, 23 Oct 2024 17:56:09 +0900
Subject: [PATCH 50/76] Update submission.py

---
 dpdispatcher/submission.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/dpdispatcher/submission.py b/dpdispatcher/submission.py
index a661c218..e7be4b1e 100644
--- a/dpdispatcher/submission.py
+++ b/dpdispatcher/submission.py
@@ -842,12 +842,14 @@ def handle_unexpected_job_state(self):
             f"job: {self.job_hash} {self.job_id} terminated; "
             f"fail_cout is {self.fail_count}; resubmitting job"
         )
-        retry_count = 3
+        retry_count = 3  # Default retry count
         assert self.machine is not None
         if hasattr(self.machine, "retry_count"):
            retry_count = self.machine.retry_count

-        if (self.fail_count) > 0 and (self.fail_count % retry_count == 0):
+        dlog.info(f"retry_count: {retry_count}")
+
+        if self.fail_count > retry_count:
             last_error_message = self.get_last_error_message()
             err_msg = (
                 f"job:{self.job_hash} {self.job_id} failed {self.fail_count} times."

From 1787bcc0c6ef1cc6d37a0b084233e5c9fd41bfc3 Mon Sep 17 00:00:00 2001
From: Thang Nguyen <46436648+thangckt@users.noreply.github.com>
Date: Thu, 31 Oct 2024 15:40:00 +0900
Subject: [PATCH 51/76] Update pbs.py

---
 dpdispatcher/machines/pbs.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/dpdispatcher/machines/pbs.py b/dpdispatcher/machines/pbs.py
index 50ca3c5f..af3ca091 100644
--- a/dpdispatcher/machines/pbs.py
+++ b/dpdispatcher/machines/pbs.py
@@ -217,12 +217,13 @@ def gen_script_header(self, job):
         # resources.number_node is not used in SGE
         resources = job.resources
         job_name = resources.kwargs.get("job_name", "wDPjob")
-        pe_name = resources.kwargs.get("pe_name", "mpi")
+        pe_name = resources.kwargs.get("pe_name", None)
         sge_script_header_dict = {}
         sge_script_header_dict["select_node_line"] = f"#$ -N {job_name}\n"
-        sge_script_header_dict["select_node_line"] += (
-            f"#$ -pe {pe_name} {resources.cpu_per_node}\n"
-        )
+        if pe_name is not None:
+            sge_script_header_dict["select_node_line"] += (
+                f"#$ -pe {pe_name} {resources.cpu_per_node}\n"
+            )

         if resources.queue_name != "":
             sge_script_header_dict["select_node_line"] += (

From 6bd72ad38d4612fe9630cb8896c21c39cf7a861f Mon Sep 17 00:00:00 2001
From: Thang Nguyen <46436648+thangckt@users.noreply.github.com>
Date: Mon, 9 Dec 2024 00:32:10 +0900
Subject: [PATCH 52/76] Update shell.py

---
 dpdispatcher/machines/shell.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/dpdispatcher/machines/shell.py b/dpdispatcher/machines/shell.py
index babb1971..b9beac11 100644
--- a/dpdispatcher/machines/shell.py
+++ b/dpdispatcher/machines/shell.py
@@ -1,3 +1,4 @@
+import os
 import shlex

 from dpdispatcher.dlog import dlog
@@ -39,6 +40,9 @@ def do_submit(self, job):
         script_run_file_name = f"{job.script_file_name}.run"
         self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
         cmd = f"cd {shlex.quote(self.context.remote_root)} && {{ nohup bash {script_file_name} 1>>{output_name} 2>>{output_name} & }} && echo $!"
+        if os.name == "nt":
+            cmd = f"cd /d {self.context.remote_root} && start /b bash {script_file_name} >> {output_name} 2>&1"
+
         ret, stdin, stdout, stderr = self.context.block_call(cmd)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")

From 57a599107b54345127241a645acc72c4a8d7851e Mon Sep 17 00:00:00 2001
From: Thang Nguyen <46436648+thangckt@users.noreply.github.com>
Date: Mon, 9 Dec 2024 21:04:24 +0900
Subject: [PATCH 53/76] u

---
 dpdispatcher/machines/batch.py | 98 ++++++++++++++++++++++++++++++++++
 dpdispatcher/machines/shell.py |  3 --
 2 files changed, 98 insertions(+), 3 deletions(-)
 create mode 100644 dpdispatcher/machines/batch.py

diff --git a/dpdispatcher/machines/batch.py b/dpdispatcher/machines/batch.py
new file mode 100644
index 00000000..21cfca1a
--- /dev/null
+++ b/dpdispatcher/machines/batch.py
@@ -0,0 +1,98 @@
+import os
+import shlex
+from subprocess import PIPE, Popen
+
+from dpdispatcher.dlog import dlog
+from dpdispatcher.machine import Machine
+from dpdispatcher.utils.job_status import JobStatus
+from dpdispatcher.utils.utils import customized_script_header_template
+
+shell_script_header_template = """@echo off\n"""
+
+
+class Batch(Machine):
+    def gen_script(self, job):
+        shell_script = super().gen_script(job)
+        return shell_script
+
+    def gen_script_header(self, job):
+        resources = job.resources
+        if (
+            resources["strategy"].get("customized_script_header_template_file")
+            is not None
+        ):
+            shell_script_header = customized_script_header_template(
+                resources["strategy"]["customized_script_header_template_file"],
+                resources,
+            )
+        else:
+            shell_script_header = shell_script_header_template
+        return shell_script_header
+
+    def do_submit(self, job):
+        script_str = self.gen_script(job)
+        script_file_name = job.script_file_name
+        job_id_name = job.job_hash + "_job_id"
+        output_name = job.job_hash + ".out"
+        self.context.write_file(fname=script_file_name, write_str=script_str)
+        script_run_str = self.gen_script_command(job)
+        script_run_file_name = f"{job.script_file_name}.run"
+        self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
+
+        cmd = f"start /B cmd /C {shlex.quote(script_file_name)} > {output_name} 2>&1 && echo %!PID!"
+        process = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
+        stdout, stderr = process.communicate()
+
+        if process.returncode != 0:
+            err_str = stderr.decode("utf-8")
+            raise RuntimeError(
+                f"Failed to execute command: {cmd}\nError: {err_str}\nReturn code: {process.returncode}"
+            )
+
+        job_id = stdout.decode("utf-8").strip()
+        self.context.write_file(job_id_name, job_id)
+        return job_id
+
+    def default_resources(self, resources):
+        pass
+
+    def check_status(self, job):
+        job_id = job.job_id
+
+        if not job_id:
+            return JobStatus.unsubmitted
+
+        cmd = f'tasklist /FI "PID eq {job_id}"'
+        process = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
+        stdout, stderr = process.communicate()
+
+        if process.returncode != 0:
+            err_str = stderr.decode("utf-8")
+            raise RuntimeError(
+                f"Failed to execute command: {cmd}\nError: {err_str}\nReturn code: {process.returncode}"
+            )
+
+        output = stdout.decode("utf-8")
+        if str(job_id) in output:
+            if self.check_finish_tag(job):
+                dlog.info(f"job: {job.job_hash} {job.job_id} finished")
+                return JobStatus.finished
+            return JobStatus.running
+        else:
+            return JobStatus.terminated
+
+    def check_finish_tag(self, job):
+        job_tag_finished = job.job_hash + "_job_tag_finished"
+        return self.context.check_file_exists(job_tag_finished)
+
+    def kill(self, job):
+        job_id = job.job_id
+        cmd = f"taskkill /PID {job_id} /F"
+        process = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
+        stdout, stderr = process.communicate()
+
+        if process.returncode != 0:
+            err_str = stderr.decode("utf-8")
+            raise RuntimeError(
+                f"Failed to kill job {job_id}: {err_str}\nReturn code: {process.returncode}"
+            )
diff --git a/dpdispatcher/machines/shell.py b/dpdispatcher/machines/shell.py
index b9beac11..601dd399 100644
--- a/dpdispatcher/machines/shell.py
+++ b/dpdispatcher/machines/shell.py
@@ -40,9 +40,6 @@ def do_submit(self, job):
         script_run_file_name = f"{job.script_file_name}.run"
         self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
         cmd = f"cd {shlex.quote(self.context.remote_root)} && {{ nohup bash {script_file_name} 1>>{output_name} 2>>{output_name} & }} && echo $!"
-        if os.name == "nt":
-            cmd = f"cd /d {self.context.remote_root} && start /b bash {script_file_name} >> {output_name} 2>&1"
-
         ret, stdin, stdout, stderr = self.context.block_call(cmd)
         if ret != 0:
             err_str = stderr.read().decode("utf-8")

From 69b4a0bd672f980e456e8fedb6a6921cfa283f7c Mon Sep 17 00:00:00 2001
From: Thang Nguyen <46436648+thangckt@users.noreply.github.com>
Date: Mon, 9 Dec 2024 21:32:35 +0900
Subject: [PATCH 54/76] Update batch.py

---
 dpdispatcher/machines/batch.py | 39 +++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/dpdispatcher/machines/batch.py b/dpdispatcher/machines/batch.py
index 21cfca1a..9c5b0950 100644
--- a/dpdispatcher/machines/batch.py
+++ b/dpdispatcher/machines/batch.py
@@ -1,6 +1,6 @@
 import os
 import shlex
-from subprocess import PIPE, Popen
+import subprocess

 from dpdispatcher.dlog import dlog
 from dpdispatcher.machine import Machine
@@ -40,16 +40,21 @@ def do_submit(self, job):
         self.context.write_file(fname=script_run_file_name, write_str=script_run_str)

         cmd = f"start /B cmd /C {shlex.quote(script_file_name)} > {output_name} 2>&1 && echo %!PID!"
-        process = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
-        stdout, stderr = process.communicate()
+        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)

-        if process.returncode != 0:
-            err_str = stderr.decode("utf-8")
+        print(result.stdout)
+
+        if result.returncode != 0:
+            raise RuntimeError(
+                f"Failed to execute command: {cmd}\nError: {result.stderr}\nReturn code: {result.returncode}"
+            )
+
+        job_id = result.stdout.strip()
+        if not job_id.isdigit():
             raise RuntimeError(
-                f"Failed to execute command: {cmd}\nError: {err_str}\nReturn code: {process.returncode}"
+                f"Failed to retrieve job ID from output: {result.stdout}"
             )

-        job_id = stdout.decode("utf-8").strip()
         self.context.write_file(job_id_name, job_id)
         return job_id

@@ -63,19 +68,15 @@ def check_status(self, job):
             return JobStatus.unsubmitted

         cmd = f'tasklist /FI "PID eq {job_id}"'
-        process = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
-        stdout, stderr = process.communicate()
+        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)

-        if process.returncode != 0:
-            err_str = stderr.decode("utf-8")
+        if result.returncode != 0:
             raise RuntimeError(
-                f"Failed to execute command: {cmd}\nError: {err_str}\nReturn code: {process.returncode}"
+                f"Failed to execute command: {cmd}\nError: {result.stderr}\nReturn code: {result.returncode}"
             )

-        output = stdout.decode("utf-8")
-        if str(job_id) in output:
+        if str(job_id) in result.stdout:
             if self.check_finish_tag(job):
-                dlog.info(f"job: {job.job_hash} {job.job_id} finished")
                 return JobStatus.finished
             return JobStatus.running
         else:
             return JobStatus.terminated
@@ -88,11 +89,9 @@ def check_finish_tag(self, job):
     def kill(self, job):
         job_id = job.job_id
         cmd = f"taskkill /PID {job_id} /F"
-        process = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
-        stdout, stderr = process.communicate()
+        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)

-        if process.returncode != 0:
-            err_str = stderr.decode("utf-8")
+        if result.returncode != 0:
             raise RuntimeError(
-                f"Failed to kill job {job_id}: {err_str}\nReturn code: {process.returncode}"
+                f"Failed to kill job {job_id}: {result.stderr}\nReturn code: {result.returncode}"
             )

From e80b7128f158247a4b2fab2fbaa8e7eb57fbb480 Mon Sep 17 00:00:00 2001
From: Thang Nguyen <46436648+thangckt@users.noreply.github.com>
Date: Mon, 9 Dec 2024 22:25:38 +0900
Subject: [PATCH 55/76] Update batch.py

---
 dpdispatcher/machines/batch.py | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/dpdispatcher/machines/batch.py b/dpdispatcher/machines/batch.py
index 9c5b0950..20d23088 100644
--- a/dpdispatcher/machines/batch.py
+++ b/dpdispatcher/machines/batch.py
@@ -7,7 +7,7 @@
 from dpdispatcher.utils.job_status import JobStatus
 from dpdispatcher.utils.utils import customized_script_header_template

-shell_script_header_template = """@echo off\n"""
+shell_script_header_template = """@echo off"""


 class Batch(Machine):
@@ -38,22 +38,20 @@ def do_submit(self, job):
         script_run_str = self.gen_script_command(job)
         script_run_file_name = f"{job.script_file_name}.run"
         self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
+        cmd = f"cd {shlex.quote(self.context.remote_root)} && {script_file_name} > {output_name} 2>&1"
+        ret, stdin, stdout, stderr = self.context.block_call(cmd)

-        cmd = f"start /B cmd /C {shlex.quote(script_file_name)} > {output_name} 2>&1 && echo %!PID!"
-        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
-
-        print(result.stdout)
-
-        if result.returncode != 0:
-            raise RuntimeError(
-                f"Failed to execute command: {cmd}\nError: {result.stderr}\nReturn code: {result.returncode}"
-            )
+        print("ret:", ret)
+        print("stdin:", stdin.read().decode("utf-8"))
+        print("stdout:", stdout.read().decode("utf-8"))
+        print("stderr:", stderr.read().decode("utf-8"))

-        job_id = result.stdout.strip()
-        if not job_id.isdigit():
+        if ret != 0:
+            err_str = stderr.read().decode("utf-8")
             raise RuntimeError(
-                f"Failed to retrieve job ID from output: {result.stdout}"
+                f"status command {cmd} fails to execute\nerror message:{err_str}\nreturn code {ret}\n"
             )
+        job_id = int(stdout.read().decode("utf-8").strip())

         self.context.write_file(job_id_name, job_id)
         return job_id
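Since the Batch machine in the patches above leans entirely on Windows process tools, a minimal self-contained sketch of its liveness test may be useful; pid_alive is an assumed name, and only the tasklist filter and the substring test come from the diffs:

    import subprocess

    # Hedged sketch: poll a Windows PID the way Batch.check_status does,
    # by filtering `tasklist` output on the PID.
    def pid_alive(pid: int) -> bool:
        result = subprocess.run(
            f'tasklist /FI "PID eq {pid}"',
            shell=True, capture_output=True, text=True,
        )
        # The substring test mirrors the patch; it can false-positive when the
        # PID digits also appear elsewhere in the table (e.g. a memory column).
        return str(pid) in result.stdout

    print(pid_alive(4))  # PID 4 is normally the Windows System process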
From 4bf6a7652310c87a6625b8590713bb63bf6763f3 Mon Sep 17 00:00:00 2001
From: Thang Nguyen <46436648+thangckt@users.noreply.github.com>
Date: Wed, 8 Jan 2025 00:35:40 +0900
Subject: [PATCH 56/76] Update machine.py

---
 dpdispatcher/machine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dpdispatcher/machine.py b/dpdispatcher/machine.py
index a78b61f5..1a22bbe7 100644
--- a/dpdispatcher/machine.py
+++ b/dpdispatcher/machine.py
@@ -20,7 +20,7 @@
 script_env_template = """
 REMOTE_ROOT=$(readlink -f {remote_root})
-echo 0 > $REMOTE_ROOT/{flag_if_job_task_fail}
+echo 0 > "$REMOTE_ROOT/{flag_if_job_task_fail}"
 test $? -ne 0 && exit 1

 {module_unload_part}

From 170b1ef5070224d092841fad03b1378b523f7b39 Mon Sep 17 00:00:00 2001
From: Thang Nguyen <46436648+thangckt@users.noreply.github.com>
Date: Wed, 8 Jan 2025 01:34:23 +0900
Subject: [PATCH 57/76] Update machine.py

---
 dpdispatcher/machine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dpdispatcher/machine.py b/dpdispatcher/machine.py
index 1a22bbe7..a78b61f5 100644
--- a/dpdispatcher/machine.py
+++ b/dpdispatcher/machine.py
@@ -20,7 +20,7 @@
 script_env_template = """
 REMOTE_ROOT=$(readlink -f {remote_root})
-echo 0 > "$REMOTE_ROOT/{flag_if_job_task_fail}"
+echo 0 > $REMOTE_ROOT/{flag_if_job_task_fail}
 test $? -ne 0 && exit 1

 {module_unload_part}

From 284f1f8332710d136d58aeba9c249f44005175b2 Mon Sep 17 00:00:00 2001
From: "C. Thang Nguyen"
Date: Wed, 27 Aug 2025 17:39:39 +0900
Subject: [PATCH 58/76] revise option to control the number of resubmissions of failed jobs

---
 dpdispatcher/machine.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/dpdispatcher/machine.py b/dpdispatcher/machine.py
index 01a51557..8810693a 100644
--- a/dpdispatcher/machine.py
+++ b/dpdispatcher/machine.py
@@ -82,6 +82,7 @@ def __init__(
         local_root=None,
         remote_root=None,
         remote_profile={},
+        retry_count=3,
         *,
         context=None,
     ):
@@ -396,6 +397,7 @@ def arginfo(cls):
         doc_clean_asynchronously = (
             "Clean the remote directory asynchronously after the job finishes."
         )
+        doc_retry_count = "Number of retries to submit failed jobs."

         machine_args = [
             Argument("batch_type", str, optional=False, doc=doc_batch_type),
@@ -413,6 +415,7 @@ def arginfo(cls):
                 default=False,
                 doc=doc_clean_asynchronously,
             ),
+            Argument("retry_count", int, optional=True, default=3, doc=doc_retry_count),
         ]

         context_variant = Variant(

From 8b04937d2fde9d9c2c4d96f753f0dc86416e8e0c Mon Sep 17 00:00:00 2001
From: thangckt
Date: Wed, 27 Aug 2025 19:53:08 +0900
Subject: [PATCH 59/76] revise resubmission of failed jobs

---
 dpdispatcher/machine.py                  | 5 +++--
 dpdispatcher/machines/dp_cloud_server.py | 2 +-
 dpdispatcher/machines/openapi.py         | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/dpdispatcher/machine.py b/dpdispatcher/machine.py
index 8810693a..a3354e78 100644
--- a/dpdispatcher/machine.py
+++ b/dpdispatcher/machine.py
@@ -82,9 +82,9 @@ def __init__(
         local_root=None,
         remote_root=None,
         remote_profile={},
-        retry_count=3,
         *,
         context=None,
+        retry_count=3,
     ):
         if context is None:
             assert isinstance(self, self.__class__.subclasses_dict[batch_type])
@@ -97,6 +97,7 @@ def __init__(
         else:
             pass
         self.bind_context(context=context)
+        self.retry_count = retry_count

     def bind_context(self, context):
         self.context = context
@@ -397,7 +398,7 @@ def arginfo(cls):
         doc_clean_asynchronously = (
             "Clean the remote directory asynchronously after the job finishes."
         )
-        doc_retry_count = "Number of retries to submit failed jobs."
+        doc_retry_count = "Number of retries to resubmit failed jobs."

         machine_args = [
             Argument("batch_type", str, optional=False, doc=doc_batch_type),
diff --git a/dpdispatcher/machines/dp_cloud_server.py b/dpdispatcher/machines/dp_cloud_server.py
index b4719bfe..5082c351 100644
--- a/dpdispatcher/machines/dp_cloud_server.py
+++ b/dpdispatcher/machines/dp_cloud_server.py
@@ -32,7 +32,7 @@ def __init__(self, context):
         phone = context.remote_profile.get("phone", None)
         username = context.remote_profile.get("username", None)
         password = context.remote_profile.get("password", None)
-        self.retry_count = context.remote_profile.get("retry_count", 3)
+        # self.retry_count = context.remote_profile.get("retry_count", 3)
         self.ignore_exit_code = context.remote_profile.get("ignore_exit_code", True)

         ticket = os.environ.get("BOHR_TICKET", None)
diff --git a/dpdispatcher/machines/openapi.py b/dpdispatcher/machines/openapi.py
index e5514dce..114e9026 100644
--- a/dpdispatcher/machines/openapi.py
+++ b/dpdispatcher/machines/openapi.py
@@ -38,7 +38,7 @@ def __init__(self, context):

         self.remote_profile = context.remote_profile.copy()
         self.grouped = self.remote_profile.get("grouped", True)
-        self.retry_count = self.remote_profile.get("retry_count", 3)
+        # self.retry_count = self.remote_profile.get("retry_count", 3)
         self.ignore_exit_code = context.remote_profile.get("ignore_exit_code", True)

         access_key = (

From 909e5958d58c3eb2e49949573cc986715f523d3b Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 27 Aug 2025 11:01:29 +0000
Subject: [PATCH 60/76] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 dpdispatcher/machines/batch.py | 2 --
 dpdispatcher/machines/shell.py | 1 -
 2 files changed, 3 deletions(-)

diff --git a/dpdispatcher/machines/batch.py b/dpdispatcher/machines/batch.py
index 20d23088..710317e9 100644
--- a/dpdispatcher/machines/batch.py
+++ b/dpdispatcher/machines/batch.py
@@ -1,8 +1,6 @@
-import os
 import shlex
 import subprocess

-from dpdispatcher.dlog import dlog
 from dpdispatcher.machine import Machine
 from dpdispatcher.utils.job_status import JobStatus
 from dpdispatcher.utils.utils import customized_script_header_template
diff --git a/dpdispatcher/machines/shell.py b/dpdispatcher/machines/shell.py
index 3837be02..2205e333 100644
--- a/dpdispatcher/machines/shell.py
+++ b/dpdispatcher/machines/shell.py
@@ -1,4 +1,3 @@
-import os
 import shlex

 from dpdispatcher.dlog import dlog

From 1ee843f0ea3adbed61dd817f9ebc11c7e2604a70 Mon Sep 17 00:00:00 2001
From: thangckt
Date: Wed, 27 Aug 2025 20:12:00 +0900
Subject: [PATCH 61/76] u

---
 dpdispatcher/machines/batch.py | 93 ------------------
 dpdispatcher/machines/dp_cloud_server.py | 2 +-
 dpdispatcher/machines/openapi.py | 2 +-
 dpdispatcher/machines/pbs.py | 9 +-
 dpdispatcher/submission.py | 15 +--
 tests/test_context_dir/0_md/graph.pb | 0
 tests/test_hdfs_dir/0_md/graph.pb | 0
 tests/test_lsf_dir/0_md/graph.pb | 0
 tests/test_pbs_dir/0_md/graph.pb | 0
 .../test_shell_trival_dir/parent_dir/graph.pb | 0
 tests/test_slurm_dir/0_md/graph.pb | 0
 11 files changed, 11 insertions(+), 110 deletions(-)
 delete mode 100644 dpdispatcher/machines/batch.py
 mode change 120000 => 100755 tests/test_context_dir/0_md/graph.pb
 mode change 120000 => 100755 tests/test_hdfs_dir/0_md/graph.pb
 mode change 120000 => 100755 tests/test_lsf_dir/0_md/graph.pb
 mode change 120000 => 100755 tests/test_pbs_dir/0_md/graph.pb
 mode change 120000 => 100755 tests/test_shell_trival_dir/parent_dir/graph.pb
 mode change 120000 => 100755 tests/test_slurm_dir/0_md/graph.pb

diff --git a/dpdispatcher/machines/batch.py b/dpdispatcher/machines/batch.py
deleted file mode 100644
index 710317e9..00000000
--- a/dpdispatcher/machines/batch.py
+++ /dev/null
@@ -1,93 +0,0 @@
-import shlex
-import subprocess
-
-from dpdispatcher.machine import Machine
-from dpdispatcher.utils.job_status import JobStatus
-from dpdispatcher.utils.utils import customized_script_header_template
-
-shell_script_header_template = """@echo off"""
-
-
-class Batch(Machine):
-    def gen_script(self, job):
-        shell_script = super().gen_script(job)
-        return shell_script
-
-    def gen_script_header(self, job):
-        resources = job.resources
-        if (
-            resources["strategy"].get("customized_script_header_template_file")
-            is not None
-        ):
-            shell_script_header = customized_script_header_template(
-                resources["strategy"]["customized_script_header_template_file"],
-                resources,
-            )
-        else:
-            shell_script_header = shell_script_header_template
-        return shell_script_header
-
-    def do_submit(self, job):
-        script_str = self.gen_script(job)
-        script_file_name = job.script_file_name
-        job_id_name = job.job_hash + "_job_id"
-        output_name = job.job_hash + ".out"
-        self.context.write_file(fname=script_file_name, write_str=script_str)
-        script_run_str = self.gen_script_command(job)
-        script_run_file_name = f"{job.script_file_name}.run"
-        self.context.write_file(fname=script_run_file_name, write_str=script_run_str)
-        cmd = f"cd {shlex.quote(self.context.remote_root)} && {script_file_name} > {output_name} 2>&1"
-        ret, stdin, stdout, stderr = self.context.block_call(cmd)
-
-        print("ret:", ret)
-        print("stdin:", stdin.read().decode("utf-8"))
-        print("stdout:", stdout.read().decode("utf-8"))
-        print("stderr:", stderr.read().decode("utf-8"))
-
-        if ret != 0:
-            err_str = stderr.read().decode("utf-8")
-            raise RuntimeError(
-                f"status command {cmd} fails to execute\nerror message:{err_str}\nreturn code {ret}\n"
-            )
-        job_id = int(stdout.read().decode("utf-8").strip())
-
-        self.context.write_file(job_id_name, job_id)
-        return job_id
-
-    def default_resources(self, resources):
-        pass
-
-    def check_status(self, job):
-        job_id = job.job_id
-
-        if not job_id:
-            return JobStatus.unsubmitted
-
-        cmd = f'tasklist /FI "PID eq {job_id}"'
-        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
-
-        if result.returncode != 0:
-            raise RuntimeError(
-                f"Failed to execute command: {cmd}\nError: {result.stderr}\nReturn code: {result.returncode}"
-            )
-
-        if str(job_id) in result.stdout:
-            if self.check_finish_tag(job):
-                return JobStatus.finished
-            return JobStatus.running
-        else:
-            return JobStatus.terminated
-
-    def check_finish_tag(self, job):
-        job_tag_finished = job.job_hash + "_job_tag_finished"
-        return self.context.check_file_exists(job_tag_finished)
-
-    def kill(self, job):
-        job_id = job.job_id
-        cmd = f"taskkill /PID {job_id} /F"
-        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
-
-        if result.returncode != 0:
-            raise RuntimeError(
-                f"Failed to kill job {job_id}: {result.stderr}\nReturn code: {result.returncode}"
-            )
diff --git a/dpdispatcher/machines/dp_cloud_server.py b/dpdispatcher/machines/dp_cloud_server.py
index 5082c351..b4719bfe 100644
--- a/dpdispatcher/machines/dp_cloud_server.py
+++ b/dpdispatcher/machines/dp_cloud_server.py
@@ -32,7 +32,7 @@ def __init__(self, context):
         phone = context.remote_profile.get("phone", None)
         username = context.remote_profile.get("username", None)
         password = context.remote_profile.get("password", None)
-        # self.retry_count = context.remote_profile.get("retry_count", 3)
+        self.retry_count = context.remote_profile.get("retry_count", 3)
         self.ignore_exit_code = context.remote_profile.get("ignore_exit_code", True)

         ticket = os.environ.get("BOHR_TICKET", None)
diff --git a/dpdispatcher/machines/openapi.py b/dpdispatcher/machines/openapi.py
index 114e9026..e5514dce 100644
--- a/dpdispatcher/machines/openapi.py
+++ b/dpdispatcher/machines/openapi.py
@@ -38,7 +38,7 @@ def __init__(self, context):

         self.remote_profile = context.remote_profile.copy()
         self.grouped = self.remote_profile.get("grouped", True)
-        # self.retry_count = self.remote_profile.get("retry_count", 3)
+        self.retry_count = self.remote_profile.get("retry_count", 3)
         self.ignore_exit_code = context.remote_profile.get("ignore_exit_code", True)

         access_key = (
diff --git a/dpdispatcher/machines/pbs.py b/dpdispatcher/machines/pbs.py
index ee416874..35ef4c44 100644
--- a/dpdispatcher/machines/pbs.py
+++ b/dpdispatcher/machines/pbs.py
@@ -212,13 +212,12 @@ def gen_script_header(self, job):
         # resources.number_node is not used in SGE
         resources = job.resources
         job_name = resources.kwargs.get("job_name", "wDPjob")
-        pe_name = resources.kwargs.get("pe_name", None)
+        pe_name = resources.kwargs.get("pe_name", "mpi")
         sge_script_header_dict = {}
         sge_script_header_dict["select_node_line"] = f"#$ -N {job_name}\n"
-        if pe_name is not None:
-            sge_script_header_dict["select_node_line"] += (
-                f"#$ -pe {pe_name} {resources.cpu_per_node}\n"
-            )
+        sge_script_header_dict["select_node_line"] += (
+            f"#$ -pe {pe_name} {resources.cpu_per_node}\n"
+        )

         if resources.queue_name != "":
             sge_script_header_dict["select_node_line"] += (
diff --git a/dpdispatcher/submission.py b/dpdispatcher/submission.py
index 103a1863..59376430 100644
--- a/dpdispatcher/submission.py
+++ b/dpdispatcher/submission.py
@@ -841,14 +841,11 @@ def handle_unexpected_job_state(self):
             f"job: {self.job_hash} {self.job_id} terminated; "
             f"fail_cout is {self.fail_count}; resubmitting job"
         )
-        retry_count = 3  # Default retry count
+        retry_count = 3
         assert self.machine is not None
-        if hasattr(self.machine, "retry_count"):
-            retry_count = self.machine.retry_count
-
-        dlog.info(f"retry_count: {retry_count}")
-
-        if self.fail_count > retry_count:
+        if hasattr(self.machine, "retry_count") and self.machine.retry_count >= 0:
+            retry_count = self.machine.retry_count + 1
+        if (self.fail_count) > 0 and (self.fail_count % retry_count == 0):
             last_error_message = self.get_last_error_message()
             err_msg = (
                 f"job:{self.job_hash} {self.job_id} failed {self.fail_count} times."
@@ -856,9 +853,7 @@ def handle_unexpected_job_state(self):
             if last_error_message is not None:
                 err_msg += f"\nPossible remote error message: {last_error_message}"
             raise RuntimeError(err_msg)
-        else:
-            self.submit_job()
-
+        self.submit_job()
         if self.job_state != JobStatus.unsubmitted:
             dlog.info(
                 f"job:{self.job_hash} re-submit after terminated; new job_id is {self.job_id}"
diff --git a/tests/test_context_dir/0_md/graph.pb b/tests/test_context_dir/0_md/graph.pb
deleted file mode 120000
index 9e112b9d..00000000
--- a/tests/test_context_dir/0_md/graph.pb
+++ /dev/null
@@ -1 +0,0 @@
-../../graph.pb
\ No newline at end of file
diff --git a/tests/test_context_dir/0_md/graph.pb b/tests/test_context_dir/0_md/graph.pb
new file mode 100755
index 00000000..9e112b9d
--- /dev/null
+++ b/tests/test_context_dir/0_md/graph.pb
@@ -0,0 +1 @@
+../../graph.pb
\ No newline at end of file
diff --git a/tests/test_hdfs_dir/0_md/graph.pb b/tests/test_hdfs_dir/0_md/graph.pb
deleted file mode 120000
index 9e112b9d..00000000
--- a/tests/test_hdfs_dir/0_md/graph.pb
+++ /dev/null
@@ -1 +0,0 @@
-../../graph.pb
\ No newline at end of file
diff --git a/tests/test_hdfs_dir/0_md/graph.pb b/tests/test_hdfs_dir/0_md/graph.pb
new file mode 100755
index 00000000..9e112b9d
--- /dev/null
+++ b/tests/test_hdfs_dir/0_md/graph.pb
@@ -0,0 +1 @@
+../../graph.pb
\ No newline at end of file
diff --git a/tests/test_lsf_dir/0_md/graph.pb b/tests/test_lsf_dir/0_md/graph.pb
deleted file mode 120000
index 9e112b9d..00000000
--- a/tests/test_lsf_dir/0_md/graph.pb
+++ /dev/null
@@ -1 +0,0 @@
-../../graph.pb
\ No newline at end of file
diff --git a/tests/test_lsf_dir/0_md/graph.pb b/tests/test_lsf_dir/0_md/graph.pb
new file mode 100755
index 00000000..9e112b9d
--- /dev/null
+++ b/tests/test_lsf_dir/0_md/graph.pb
@@ -0,0 +1 @@
+../../graph.pb
\ No newline at end of file
diff --git a/tests/test_pbs_dir/0_md/graph.pb b/tests/test_pbs_dir/0_md/graph.pb
deleted file mode 120000
index 9e112b9d..00000000
--- a/tests/test_pbs_dir/0_md/graph.pb
+++ /dev/null
@@ -1 +0,0 @@
-../../graph.pb
\ No newline at end of file
diff --git a/tests/test_pbs_dir/0_md/graph.pb b/tests/test_pbs_dir/0_md/graph.pb
new file mode 100755
index 00000000..9e112b9d
--- /dev/null
+++ b/tests/test_pbs_dir/0_md/graph.pb
@@ -0,0 +1 @@
+../../graph.pb
\ No newline at end of file
diff --git a/tests/test_shell_trival_dir/parent_dir/graph.pb b/tests/test_shell_trival_dir/parent_dir/graph.pb
deleted file mode 120000
index 9e112b9d..00000000
--- a/tests/test_shell_trival_dir/parent_dir/graph.pb
+++ /dev/null
@@ -1 +0,0 @@
-../../graph.pb
\ No newline at end of file
diff --git a/tests/test_shell_trival_dir/parent_dir/graph.pb b/tests/test_shell_trival_dir/parent_dir/graph.pb
new file mode 100755
index 00000000..9e112b9d
--- /dev/null
+++ b/tests/test_shell_trival_dir/parent_dir/graph.pb
@@ -0,0 +1 @@
+../../graph.pb
\ No newline at end of file
diff --git a/tests/test_slurm_dir/0_md/graph.pb b/tests/test_slurm_dir/0_md/graph.pb
deleted file mode 120000
index 9e112b9d..00000000
--- a/tests/test_slurm_dir/0_md/graph.pb
+++ /dev/null
@@ -1 +0,0 @@
-../../graph.pb
\ No newline at end of file
diff --git a/tests/test_slurm_dir/0_md/graph.pb b/tests/test_slurm_dir/0_md/graph.pb
new file mode 100755
index 00000000..9e112b9d
--- /dev/null
+++ b/tests/test_slurm_dir/0_md/graph.pb
@@ -0,0 +1 @@
+../../graph.pb
\ No newline at end of file

From 6b244da4a6ebe977a8c264db92eda1c8c3dd896f Mon Sep 17 00:00:00 2001
From: thangckt
Date: Wed, 27 Aug 2025 20:15:02 +0900
Subject: [PATCH 62/76] u

---
 dpdispatcher/machines/dp_cloud_server.py | 2 +-
 dpdispatcher/machines/openapi.py         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dpdispatcher/machines/dp_cloud_server.py b/dpdispatcher/machines/dp_cloud_server.py
index b4719bfe..5082c351 100644
--- a/dpdispatcher/machines/dp_cloud_server.py
+++ b/dpdispatcher/machines/dp_cloud_server.py
@@ -32,7 +32,7 @@ def __init__(self, context):
         phone = context.remote_profile.get("phone", None)
         username = context.remote_profile.get("username", None)
         password = context.remote_profile.get("password", None)
-        self.retry_count = context.remote_profile.get("retry_count", 3)
+        # self.retry_count = context.remote_profile.get("retry_count", 3)
        self.ignore_exit_code = context.remote_profile.get("ignore_exit_code", True)

         ticket = os.environ.get("BOHR_TICKET", None)
diff --git a/dpdispatcher/machines/openapi.py b/dpdispatcher/machines/openapi.py
index e5514dce..114e9026 100644
--- a/dpdispatcher/machines/openapi.py
+++ b/dpdispatcher/machines/openapi.py
@@ -38,7 +38,7 @@ def __init__(self, context):

         self.remote_profile = context.remote_profile.copy()
         self.grouped = self.remote_profile.get("grouped", True)
-        self.retry_count = self.remote_profile.get("retry_count", 3)
+        # self.retry_count = self.remote_profile.get("retry_count", 3)
         self.ignore_exit_code = context.remote_profile.get("ignore_exit_code", True)

         access_key = (

From 4bb42d8de7e1a6e3760d24bbfb50d99066a934d1 Mon Sep 17 00:00:00 2001
From: thangckt
Date: Wed, 27 Aug 2025 20:23:21 +0900
Subject: [PATCH 63/76] u

---
 tests/test_context_dir/0_md/graph.pb | 0
 tests/test_hdfs_dir/0_md/graph.pb | 0
 tests/test_lsf_dir/0_md/graph.pb | 0
 tests/test_pbs_dir/0_md/graph.pb | 0
 tests/test_shell_trival_dir/parent_dir/graph.pb | 0
 tests/test_slurm_dir/0_md/graph.pb | 0
 6 files changed, 0 insertions(+), 0 deletions(-)
 mode change 100755 => 120000 tests/test_context_dir/0_md/graph.pb
 mode change 100755 => 120000 tests/test_hdfs_dir/0_md/graph.pb
 mode change 100755 => 120000 tests/test_lsf_dir/0_md/graph.pb
 mode change 100755 => 120000 tests/test_pbs_dir/0_md/graph.pb
 mode change 100755 => 120000 tests/test_shell_trival_dir/parent_dir/graph.pb
 mode change 100755 => 120000 tests/test_slurm_dir/0_md/graph.pb

diff --git a/tests/test_context_dir/0_md/graph.pb b/tests/test_context_dir/0_md/graph.pb
deleted file mode 100755
index 9e112b9d..00000000
--- a/tests/test_context_dir/0_md/graph.pb
+++ /dev/null
@@ -1 +0,0 @@
-../../graph.pb
\ No newline at end of file
diff --git a/tests/test_context_dir/0_md/graph.pb b/tests/test_context_dir/0_md/graph.pb
new file mode 120000
index 00000000..9e112b9d
--- /dev/null
+++ b/tests/test_context_dir/0_md/graph.pb
@@ -0,0 +1 @@
+../../graph.pb
\ No newline at end of file
diff --git a/tests/test_hdfs_dir/0_md/graph.pb b/tests/test_hdfs_dir/0_md/graph.pb
deleted file mode 100755
index 9e112b9d..00000000
--- a/tests/test_hdfs_dir/0_md/graph.pb
+++ /dev/null
@@ -1 +0,0 @@
-../../graph.pb
\ No newline at end of file
diff --git a/tests/test_hdfs_dir/0_md/graph.pb b/tests/test_hdfs_dir/0_md/graph.pb
new file mode 120000
index 00000000..9e112b9d
--- /dev/null
+++ b/tests/test_hdfs_dir/0_md/graph.pb
@@ -0,0 +1 @@
+../../graph.pb
\ No newline at end of file
diff --git a/tests/test_lsf_dir/0_md/graph.pb b/tests/test_lsf_dir/0_md/graph.pb
deleted file mode 100755
index 9e112b9d..00000000
--- a/tests/test_lsf_dir/0_md/graph.pb
+++ /dev/null
@@ -1 +0,0 @@
-../../graph.pb
\ No newline at end of file
diff --git a/tests/test_lsf_dir/0_md/graph.pb b/tests/test_lsf_dir/0_md/graph.pb
new file mode 120000
index 00000000..9e112b9d
--- /dev/null
+++ b/tests/test_lsf_dir/0_md/graph.pb
@@ -0,0 +1 @@
+../../graph.pb
\ No newline at end of file
diff --git a/tests/test_pbs_dir/0_md/graph.pb b/tests/test_pbs_dir/0_md/graph.pb
deleted file mode 100755
index 9e112b9d..00000000
--- a/tests/test_pbs_dir/0_md/graph.pb
+++ /dev/null
@@ -1 +0,0 @@
-../../graph.pb
\ No newline at end of file
diff --git a/tests/test_pbs_dir/0_md/graph.pb b/tests/test_pbs_dir/0_md/graph.pb
new file mode 120000
index 00000000..9e112b9d
--- /dev/null
+++ b/tests/test_pbs_dir/0_md/graph.pb
@@ -0,0 +1 @@
+../../graph.pb
\ No newline at end of file
diff --git a/tests/test_shell_trival_dir/parent_dir/graph.pb b/tests/test_shell_trival_dir/parent_dir/graph.pb
deleted file mode 100755
index 9e112b9d..00000000
--- a/tests/test_shell_trival_dir/parent_dir/graph.pb
+++ /dev/null
@@ -1 +0,0 @@
-../../graph.pb
\ No newline at end of file
diff --git a/tests/test_shell_trival_dir/parent_dir/graph.pb b/tests/test_shell_trival_dir/parent_dir/graph.pb
new file mode 120000
index 00000000..9e112b9d
--- /dev/null
+++ b/tests/test_shell_trival_dir/parent_dir/graph.pb
@@ -0,0 +1 @@
+../../graph.pb
\ No newline at end of file
diff --git a/tests/test_slurm_dir/0_md/graph.pb b/tests/test_slurm_dir/0_md/graph.pb
deleted file mode 100755
index 9e112b9d..00000000
--- a/tests/test_slurm_dir/0_md/graph.pb
+++ /dev/null
@@ -1 +0,0 @@
-../../graph.pb
\ No newline at end of file
diff --git a/tests/test_slurm_dir/0_md/graph.pb b/tests/test_slurm_dir/0_md/graph.pb
new file mode 120000
index 00000000..9e112b9d
--- /dev/null
+++ b/tests/test_slurm_dir/0_md/graph.pb
@@ -0,0 +1 @@
+../../graph.pb
\ No newline at end of file

From e897d8a3a7cfbdbcc64e83c11db8048bf102bbf4 Mon Sep 17 00:00:00 2001
From: thangckt
Date: Wed, 27 Aug 2025 20:30:08 +0900
Subject: [PATCH 64/76] Update test_argcheck.py

---
 tests/test_argcheck.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_argcheck.py b/tests/test_argcheck.py
index b87f39fc..637c5254 100644
--- a/tests/test_argcheck.py
+++ b/tests/test_argcheck.py
@@ -27,6 +27,7 @@ def test_machine_argcheck(self):
                 "symlink": True,
             },
             "clean_asynchronously": False,
+            "retry_count": 3,
         }
         self.assertDictEqual(norm_dict, expected_dict)

From ae8a0a33bca50cfea1cc83c586de83bd7adbdd0a Mon Sep 17 00:00:00 2001
From: "C. Thang Nguyen"
Date: Fri, 29 Aug 2025 16:05:14 +0900
Subject: [PATCH 65/76] u

---
 .gitignore              | 1 +
 .vscode/settings.json   | 3 +++
 dpdispatcher/machine.py | 3 ++-
 3 files changed, 6 insertions(+), 1 deletion(-)
 create mode 100644 .vscode/settings.json

diff --git a/.gitignore b/.gitignore
index 9f9928d8..07bec449 100644
--- a/.gitignore
+++ b/.gitignore
@@ -40,6 +40,7 @@ pydispatcher.egg-info
 .coverage*
 dbconfig.json
 .vscode/*
+!.vscode/settings.json
 .idea
 */_version.py
 */_date.py
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 00000000..8f051979
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "ruff.lineLength": 88,
+}
\ No newline at end of file
diff --git a/dpdispatcher/machine.py b/dpdispatcher/machine.py
index a3354e78..0e1fc362 100644
--- a/dpdispatcher/machine.py
+++ b/dpdispatcher/machine.py
@@ -150,7 +150,8 @@ def load_from_dict(cls, machine_dict):
         base.check_value(machine_dict, strict=False)

         context = BaseContext.load_from_dict(machine_dict)
-        machine = machine_class(context=context)
+        retry_count = machine_dict.get("retry_count", 3)
+        machine = machine_class(context=context, retry_count=retry_count)
         return machine

     def serialize(self, if_empty_remote_profile=False):

From 0d6bf3ded39a5c22a547642bade384c153fec71e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 29 Aug 2025 07:05:29 +0000
Subject: [PATCH 66/76] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .vscode/settings.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 8f051979..6b01b5e5 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,3 +1,3 @@
 {
     "ruff.lineLength": 88,
-}
\ No newline at end of file
+}

From 1d0d467a509c02ffa5005ac2c87da5ce9dbd5f47 Mon Sep 17 00:00:00 2001
From: "C. Thang Nguyen"
Date: Fri, 29 Aug 2025 16:10:05 +0900
Subject: [PATCH 67/76] Update .gitignore

---
 .gitignore | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 07bec449..9f9928d8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -40,7 +40,6 @@ pydispatcher.egg-info
 .coverage*
 dbconfig.json
 .vscode/*
-!.vscode/settings.json
 .idea
 */_version.py
 */_date.py

From ac7116dd7cf1124749a823cc353d0944cb2d4390 Mon Sep 17 00:00:00 2001
From: "C. Thang Nguyen"
Date: Fri, 29 Aug 2025 16:12:29 +0900
Subject: [PATCH 68/76] u

---
 .gitignore            | 1 +
 .vscode/settings.json | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 9f9928d8..07bec449 100644
--- a/.gitignore
+++ b/.gitignore
@@ -40,6 +40,7 @@ pydispatcher.egg-info
 .coverage*
 dbconfig.json
 .vscode/*
+!.vscode/settings.json
 .idea
 */_version.py
 */_date.py
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 6b01b5e5..44e7fd69 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,3 +1,3 @@
 {
-    "ruff.lineLength": 88,
+    "ruff.lineLength": 88
 }

From 558e9118e16bd4ad47e472bdfaaa6735f90d9823 Mon Sep 17 00:00:00 2001
From: "C. Thang Nguyen"
Date: Fri, 29 Aug 2025 16:21:21 +0900
Subject: [PATCH 69/76] Update test_class_machine_dispatch.py

---
 tests/test_class_machine_dispatch.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_class_machine_dispatch.py b/tests/test_class_machine_dispatch.py
index db912a31..caeac1f5 100644
--- a/tests/test_class_machine_dispatch.py
+++ b/tests/test_class_machine_dispatch.py
@@ -189,6 +189,7 @@ def test_lebesgue(self):
             "context_type": "LebesgueContext",
             "local_root": "./",
             "remote_root": "./",
+            "retry_count": 3,
             "remote_profile": {
                 "email": "114@514.com",
                 "password": "114514",

From c8b0198de397585c586570831d2a78843b788307 Mon Sep 17 00:00:00 2001
From: "C. Thang Nguyen"
Date: Fri, 29 Aug 2025 16:30:09 +0900
Subject: [PATCH 70/76] u

---
 dpdispatcher/machine.py              | 7 +++----
 tests/test_class_machine_dispatch.py | 1 -
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/dpdispatcher/machine.py b/dpdispatcher/machine.py
index 0e1fc362..ce0fae00 100644
--- a/dpdispatcher/machine.py
+++ b/dpdispatcher/machine.py
@@ -82,9 +82,9 @@ def __init__(
         local_root=None,
         remote_root=None,
         remote_profile={},
+        retry_count=3,
         *,
         context=None,
-        retry_count=3,
     ):
         if context is None:
             assert isinstance(self, self.__class__.subclasses_dict[batch_type])
@@ -93,11 +93,11 @@ def __init__(
                 local_root=local_root,
                 remote_root=remote_root,
                 remote_profile=remote_profile,
+                retry_count=retry_count,
             )
         else:
             pass
         self.bind_context(context=context)
-        self.retry_count = retry_count

     def bind_context(self, context):
         self.context = context
@@ -150,8 +150,7 @@ def load_from_dict(cls, machine_dict):
         base.check_value(machine_dict, strict=False)

         context = BaseContext.load_from_dict(machine_dict)
-        retry_count = machine_dict.get("retry_count", 3)
-        machine = machine_class(context=context, retry_count=retry_count)
+        machine = machine_class(context=context)
         return machine

     def serialize(self, if_empty_remote_profile=False):
diff --git a/tests/test_class_machine_dispatch.py b/tests/test_class_machine_dispatch.py
index caeac1f5..db912a31 100644
--- a/tests/test_class_machine_dispatch.py
+++ b/tests/test_class_machine_dispatch.py
@@ -189,7 +189,6 @@ def test_lebesgue(self):
             "context_type": "LebesgueContext",
             "local_root": "./",
             "remote_root": "./",
-            "retry_count": 3,
             "remote_profile": {
                 "email": "114@514.com",
                 "password": "114514",

From 29604438fc1db5eab58395feacdfdf2cc357dd61 Mon Sep 17 00:00:00 2001
From: "C. Thang Nguyen"
Date: Fri, 29 Aug 2025 16:38:03 +0900
Subject: [PATCH 71/76] Revert "u"

This reverts commit c8b0198de397585c586570831d2a78843b788307.
---
 dpdispatcher/machine.py              | 7 ++++---
 tests/test_class_machine_dispatch.py | 1 +
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/dpdispatcher/machine.py b/dpdispatcher/machine.py
index ce0fae00..0e1fc362 100644
--- a/dpdispatcher/machine.py
+++ b/dpdispatcher/machine.py
@@ -82,9 +82,9 @@ def __init__(
         local_root=None,
         remote_root=None,
         remote_profile={},
-        retry_count=3,
         *,
         context=None,
+        retry_count=3,
     ):
         if context is None:
             assert isinstance(self, self.__class__.subclasses_dict[batch_type])
@@ -93,11 +93,11 @@ def __init__(
                 local_root=local_root,
                 remote_root=remote_root,
                 remote_profile=remote_profile,
-                retry_count=retry_count,
             )
         else:
             pass
         self.bind_context(context=context)
+        self.retry_count = retry_count

     def bind_context(self, context):
         self.context = context
@@ -150,7 +150,8 @@ def load_from_dict(cls, machine_dict):
         base.check_value(machine_dict, strict=False)

         context = BaseContext.load_from_dict(machine_dict)
-        machine = machine_class(context=context)
+        retry_count = machine_dict.get("retry_count", 3)
+        machine = machine_class(context=context, retry_count=retry_count)
         return machine

     def serialize(self, if_empty_remote_profile=False):
diff --git a/tests/test_class_machine_dispatch.py b/tests/test_class_machine_dispatch.py
index db912a31..caeac1f5 100644
--- a/tests/test_class_machine_dispatch.py
+++ b/tests/test_class_machine_dispatch.py
@@ -189,6 +189,7 @@ def test_lebesgue(self):
             "context_type": "LebesgueContext",
             "local_root": "./",
             "remote_root": "./",
+            "retry_count": 3,
             "remote_profile": {
                 "email": "114@514.com",
                 "password": "114514",

From 6682e061af6eb91dfa542c40852144d7a013cede Mon Sep 17 00:00:00 2001
From: "C. Thang Nguyen"
Date: Fri, 29 Aug 2025 16:44:54 +0900
Subject: [PATCH 72/76] Update test_class_machine_dispatch.py

---
 tests/test_class_machine_dispatch.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_class_machine_dispatch.py b/tests/test_class_machine_dispatch.py
index caeac1f5..db912a31 100644
--- a/tests/test_class_machine_dispatch.py
+++ b/tests/test_class_machine_dispatch.py
@@ -189,7 +189,6 @@ def test_lebesgue(self):
             "context_type": "LebesgueContext",
             "local_root": "./",
             "remote_root": "./",
-            "retry_count": 3,
             "remote_profile": {
                 "email": "114@514.com",
                 "password": "114514",

From 26a3732cd9d6b863c0b0a47f4f2867c6fdc2cbe7 Mon Sep 17 00:00:00 2001
From: thangckt
Date: Fri, 29 Aug 2025 19:35:46 +0900
Subject: [PATCH 73/76] Update machine.py

---
 dpdispatcher/machine.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dpdispatcher/machine.py b/dpdispatcher/machine.py
index 0e1fc362..9dc54fdc 100644
--- a/dpdispatcher/machine.py
+++ b/dpdispatcher/machine.py
@@ -164,6 +164,7 @@ def serialize(self, if_empty_remote_profile=False):
             machine_dict["remote_profile"] = self.context.remote_profile
         else:
             machine_dict["remote_profile"] = {}
+        machine_dict["retry_count"] = self.retry_count
         # normalize the dict
         base = self.arginfo()
         machine_dict = base.normalize_value(machine_dict, trim_pattern="_*")

From 77ce083e22c208d1f904f2775a43da8c746b4ef5 Mon Sep 17 00:00:00 2001
From: thangckt
Date: Fri, 29 Aug 2025 19:48:12 +0900
Subject: [PATCH 74/76] u

---
 dpdispatcher/machines/dp_cloud_server.py | 2 +-
 dpdispatcher/machines/openapi.py         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dpdispatcher/machines/dp_cloud_server.py b/dpdispatcher/machines/dp_cloud_server.py
index 5082c351..5f8143bc 100644
--- a/dpdispatcher/machines/dp_cloud_server.py
+++ b/dpdispatcher/machines/dp_cloud_server.py
@@ -20,6 +20,7 @@ class Bohrium(Machine):
     alias = ("Lebesgue", "DpCloudServer")

     def __init__(self, context):
+        super().__init__(context)
         self.context = context
         self.input_data = context.remote_profile["input_data"].copy()
         self.api_version = 2
@@ -32,7 +33,6 @@ def __init__(self, context):
         phone = context.remote_profile.get("phone", None)
         username = context.remote_profile.get("username", None)
         password = context.remote_profile.get("password", None)
-        # self.retry_count = context.remote_profile.get("retry_count", 3)
         self.ignore_exit_code = context.remote_profile.get("ignore_exit_code", True)

         ticket = os.environ.get("BOHR_TICKET", None)
diff --git a/dpdispatcher/machines/openapi.py b/dpdispatcher/machines/openapi.py
index 114e9026..aec1f3d3 100644
--- a/dpdispatcher/machines/openapi.py
+++ b/dpdispatcher/machines/openapi.py
@@ -30,6 +30,7 @@ def unzip_file(zip_file, out_dir="./"):

 class OpenAPI(Machine):
     def __init__(self, context):
+        super().__init__(context)
         if not found_bohriumsdk:
             raise ModuleNotFoundError(
                 "bohriumsdk not installed. Install dpdispatcher with `pip install dpdispatcher[bohrium]`"
@@ -38,7 +39,6 @@ def __init__(self, context):

         self.remote_profile = context.remote_profile.copy()
         self.grouped = self.remote_profile.get("grouped", True)
-        # self.retry_count = self.remote_profile.get("retry_count", 3)
         self.ignore_exit_code = context.remote_profile.get("ignore_exit_code", True)

         access_key = (

From ecc93b226c3a2571f1d4b0014886901222249f71 Mon Sep 17 00:00:00 2001
From: thangckt
Date: Fri, 29 Aug 2025 20:02:08 +0900
Subject: [PATCH 75/76] u

---
 dpdispatcher/machine.py                  | 2 +-
 dpdispatcher/machines/dp_cloud_server.py | 4 ++--
 dpdispatcher/machines/openapi.py         | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/dpdispatcher/machine.py b/dpdispatcher/machine.py
index 9dc54fdc..d6dbf785 100644
--- a/dpdispatcher/machine.py
+++ b/dpdispatcher/machine.py
@@ -82,9 +82,9 @@ def __init__(
         local_root=None,
         remote_root=None,
         remote_profile={},
+        retry_count=3,
         *,
         context=None,
-        retry_count=3,
     ):
         if context is None:
             assert isinstance(self, self.__class__.subclasses_dict[batch_type])
diff --git a/dpdispatcher/machines/dp_cloud_server.py b/dpdispatcher/machines/dp_cloud_server.py
index 5f8143bc..001a17fe 100644
--- a/dpdispatcher/machines/dp_cloud_server.py
+++ b/dpdispatcher/machines/dp_cloud_server.py
@@ -19,8 +19,8 @@ class Bohrium(Machine):
     alias = ("Lebesgue", "DpCloudServer")

-    def __init__(self, context):
-        super().__init__(context)
+    def __init__(self, context, **kwargs):
+        super().__init__(context=context, **kwargs)
         self.context = context
         self.input_data = context.remote_profile["input_data"].copy()
         self.api_version = 2
diff --git a/dpdispatcher/machines/openapi.py b/dpdispatcher/machines/openapi.py
index aec1f3d3..64c57c4c 100644
--- a/dpdispatcher/machines/openapi.py
+++ b/dpdispatcher/machines/openapi.py
@@ -29,8 +29,8 @@ def unzip_file(zip_file, out_dir="./"):

 class OpenAPI(Machine):
-    def __init__(self, context):
-        super().__init__(context)
+    def __init__(self, context, **kwargs):
+        super().__init__(context=context, **kwargs)
         if not found_bohriumsdk:
             raise ModuleNotFoundError(
                 "bohriumsdk not installed. Install dpdispatcher with `pip install dpdispatcher[bohrium]`"

From bfe43566911134a7f43ee10e093f8cb39e702f64 Mon Sep 17 00:00:00 2001
From: thangckt
Date: Sat, 30 Aug 2025 00:42:08 +0900
Subject: [PATCH 76/76] Update pbs.py

---
 dpdispatcher/machines/pbs.py | 23 +++++------------------
 1 file changed, 5 insertions(+), 18 deletions(-)

diff --git a/dpdispatcher/machines/pbs.py b/dpdispatcher/machines/pbs.py
index 35ef4c44..9942cb89 100644
--- a/dpdispatcher/machines/pbs.py
+++ b/dpdispatcher/machines/pbs.py
@@ -17,6 +17,9 @@
 class PBS(Machine):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
     def gen_script(self, job):
         pbs_script = super().gen_script(job)
         return pbs_script
@@ -188,24 +191,8 @@ def gen_script_header(self, job):


 class SGE(PBS):
-    def __init__(
-        self,
-        batch_type=None,
-        context_type=None,
-        local_root=None,
-        remote_root=None,
-        remote_profile={},
-        *,
-        context=None,
-    ):
-        super(PBS, self).__init__(
-            batch_type,
-            context_type,
-            local_root,
-            remote_root,
-            remote_profile,
-            context=context,
-        )
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)

     def gen_script_header(self, job):
         ### Ref:https://softpanorama.org/HPC/PBS_and_derivatives/Reference/pbs_command_vs_sge_commands.shtml
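For orientation, a rough sketch of the header that the SGE gen_script_header in these patches assembles; the resource values below are invented for illustration and stand in for job.resources:

    # Hedged sketch: the select_node_line built from job.resources.
    job_name = "wDPjob"      # resources.kwargs.get("job_name", "wDPjob")
    pe_name = "mpi"          # resources.kwargs.get("pe_name", "mpi")
    cpu_per_node = 8         # resources.cpu_per_node
    queue_name = "all.q"     # resources.queue_name; skipped when ""

    select_node_line = f"#$ -N {job_name}\n"
    select_node_line += f"#$ -pe {pe_name} {cpu_per_node}\n"
    if queue_name != "":
        select_node_line += f"#$ -q {queue_name}"
    print(select_node_line)
    # #$ -N wDPjob
    # #$ -pe mpi 8
    # #$ -q all.q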