galaxyproject · pvanheus · Jul 21, 2025 · Jul 21, 2025 · Jul 21, 2025 · Jul 21, 2025
diff --git a/data_managers/data_manager_nextclade/.shed.yml b/data_managers/data_manager_nextclade/.shed.yml
@@ -7,4 +7,5 @@ long_description: |
 name: data_manager_nextclade
 owner: iuc
 remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_nextclade
+homepage_url: https://github.com/nextstrain/nextclade
 type: unrestricted
diff --git a/data_managers/data_manager_nextclade/data_manager/generated_macros.xml b/data_managers/data_manager_nextclade/data_manager/generated_macros.xml
@@ -0,0 +1 @@
+../../../tools/nextclade/generated_macros.xml
diff --git a/data_managers/data_manager_nextclade/data_manager/nextclade_dm.py b/data_managers/data_manager_nextclade/data_manager/nextclade_dm.py
@@ -10,6 +10,14 @@
 from typing import List
 
 
+def get_nextclade_version() -> str:
+    # Returns the version of the nextclade CLI tool.
+    version_cmd = ["nextclade", "--version"]
+    version_proc = subprocess.run(version_cmd, capture_output=True, check=True)
+    version_output = version_proc.stdout.decode("utf-8").strip()
+    return version_output.split()[1]  # e.g., "nextclade 2.3.0"
+
+
 def parse_date(d: str) -> datetime.datetime:
     # Parses the publication date from the nextclade release tags or user input into a datetime object.
     date = None
@@ -20,46 +28,57 @@ def parse_date(d: str) -> datetime.datetime:
     return date
 
 
-def entry_to_tag(entry: dict) -> str:
-    return (
-        entry["attributes"]["name"]["value"] + "_" + entry["attributes"]["tag"]["value"]
-    )
+def entry_to_tag(name: str, tag: str) -> str:
+    return "_".join([name, tag])
 
 
-def get_database_list() -> List[dict]:
-    list_cmd = [
-        "nextclade",
-        "dataset",
-        "list",
-        "--json",
-        "--include-old",
-        "--include-incompatible",
-    ]
+def get_database_list(default_compatibility: str) -> List[dict]:
+    list_cmd = ["nextclade", "dataset", "list", "--json"]
     list_proc = subprocess.run(list_cmd, capture_output=True, check=True)
     database_list = json.loads(list_proc.stdout)
     entry_list = []
     for db_entry in database_list:
         attributes = db_entry["attributes"]
-        entry = {
-            "value": entry_to_tag(db_entry),
-            "database_name": attributes["name"]["value"],
-            "description": attributes["name"]["valueFriendly"],
-            "date": datetime.datetime.fromisoformat(
-                attributes["tag"]["value"].replace("Z", "")
-            ),
-            "tag": attributes["tag"]["value"],
-            "min_nextclade_version": db_entry["compatibility"]["nextcladeCli"]["min"],
-        }
-        entry_list.append(entry)
+        name = db_entry["path"]
+        if "shortcuts" in db_entry:
+            name = db_entry["shortcuts"][0]
+        description = attributes["name"]
+        if (
+            "CY121680" in db_entry["path"] and description == "Influenza A H1N1pdm HA"
+        ) or ("CY163680" in db_entry["path"] and description == "Influenza A H3N2 HA"):
+            description += " (broad)"
+        if name.startswith("community/"):
+            description += " (community contributed)"
+
+        for version in db_entry["versions"]:
+            version_date = datetime.datetime.fromisoformat(
+                version["updatedAt"].replace("Z", "")
+            )
+            tag = version["tag"]
+            if "compatibility" not in version:
+                version["compatibility"] = {
+                    "cli": default_compatibility,
+                    "web": default_compatibility,
+                }
+            entry = {
+                "value": entry_to_tag(name, tag),
+                "database_name": name,
+                "description": description,
+                "date": version_date,
+                "tag": version["tag"],
+                "min_nextclade_version": version["compatibility"]["cli"],
+            }
+            entry_list.append(entry)
+
     return entry_list
 
 
 def filter_by_date(
-    existing_release_tags: List[str],
+    existing_release_tags: set[str],
     name: str,
     releases: list,
-    start_date: datetime.datetime = None,
-    end_date: datetime.datetime = None,
+    start_date: datetime.datetime | None = None,
+    end_date: datetime.datetime | None = None,
 ) -> List[dict]:
     ret = []
     for release in releases:
@@ -108,6 +127,7 @@ def comma_split(args: str) -> List[str]:
     parser.add_argument("--end_date", type=parse_date)
     parser.add_argument("--known_revisions", type=comma_split)
     parser.add_argument("--datasets", type=comma_split, default=["sars-cov-2"])
+
     parser.add_argument("datatable_name", default="nextclade")
     parser.add_argument("galaxy_config")
     args = parser.parse_args()
@@ -118,13 +138,16 @@ def comma_split(args: str) -> List[str]:
     else:
         existing_release_tags = set()
 
-    releases_available = get_database_list()
+    nextclade_version = get_nextclade_version()
+    major_version = nextclade_version.split(".")[0]
+    default_compatibility = f"{major_version}.0.0"
+    releases_available = get_database_list(default_compatibility)
     if args.testmode:
         releases = []
         for name in args.datasets:
             releases.extend(
                 filter_by_date(
-                    [],
+                    set(),
                     name,
                     releases_available,
                     start_date=args.start_date,

diff --git a/data_managers/data_manager_nextclade/data_manager/nextclade_dm.xml b/data_managers/data_manager_nextclade/data_manager/nextclade_dm.xml
@@ -1,8 +1,10 @@
-<tool id="data_manager_nextclade" name="nextclade data manager" version="0.0.1+galaxy0" tool_type="manage_data" profile="20.01">
-
+<tool id="data_manager_nextclade" name="nextclade data manager" version="0.0.2+galaxy0" tool_type="manage_data" profile="24.0">
+    <macros>
+        <import>generated_macros.xml</import>
+    </macros>
     <requirements>
-        <requirement type="package" version="3.8">python</requirement>
-        <requirement type="package" version="2.3.0">nextclade</requirement>
+        <requirement type="package" version="3.12">python</requirement>
+        <requirement type="package" version="@TOOL_VERSION@">nextclade</requirement>
     </requirements>
     <command detect_errors="exit_code"><![CDATA[
     #set $data_table = $__app__.tool_data_tables.get('nextclade')
@@ -38,15 +40,7 @@
     ]]></command>
     <inputs>
         <param name="datasets" type="select" label="Select nextclade datasets" multiple="true">
-            <option value="sars-cov-2" selected="true">SARS-CoV-2</option>
-            <option value="MPXV">Monkeypox (All Clades)</option>
-            <option value="hMPXV">Human Monkeypox (hMPXV)</option>
-            <option value="hMPXV_B1">Human Monkeypox Clade B.1</option>
-            <option value="flu_h1n1pdm_ha">Influenza A H1N1pdm HA</option>
-            <option value="flu_h3n2_ha">Influenza A H3N2 HA</option>
-            <option value="flu_vic_ha">Influenza B Victoria HA</option>
-            <option value="flu_yam_ha">Influenza B Yamagata HA</option>
-            <option value="sars-cov-2-no-recomb">SARS-CoV-2 without recombinants</option>
+            <expand macro="dataset_selector" />
         </param>
         <param name="additional_datasets" type="text" label="Additional nextclade dataset names" help="If you want to download datasets that are not in the list above, enter their names here, separated by commas">
             <validator type="regex" message="Dataset names consist of letters, numbers, underscore and hyphens, with multiple names separated by ,">^[-A-Za-z0-9_]?[-A-Za-z0-9_,]*$</validator>
@@ -72,22 +66,24 @@
         <data name="output_file" format="data_manager_json"/>
     </outputs>
     <tests>
+        <!-- test1 -->
         <test expect_num_outputs="1">
             <conditional name="release">
                 <param name="which" value="date_range" />
-                <param name="start_date" value="2022-03-01" />
-                <param name="end_date" value="2022-04-01" />
+                <param name="start_date" value="2025-03-04" />
+                <param name="end_date" value="2025-05-06" />
             </conditional>
             <output name="output_file">
                 <assert_contents>
                     <has_text text='"database_name": "sars-cov-2"' />
-                    <has_text text='sars-cov-2_2022-03-31T12-00-00Z' />
-                    <has_text text='sars-cov-2_2022-03-24T12-00-00Z' />
-                    <has_text text='sars-cov-2_2022-03-14T12-00-00Z"' />
-                    <has_text text='"min_nextclade_version": "1.10.0"' />
+                    <has_text text='sars-cov-2_2025-03-26--11-47-13Z' />
+                    <has_text text='sars-cov-2_2025-04-01--08-20-12Z' />
+                    <has_text text='sars-cov-2_2025-05-05--13-15-29Z' />
+                    <has_text text='"min_nextclade_version": "3.0.0' />
                 </assert_contents>
             </output>
         </test>
+        <!-- test2 -->
         <test expect_num_outputs="1">
             <param name="datasets" value="MPXV,hMPXV" />
             <conditional name="release">

diff --git a/data_managers/data_manager_nextclade/test-data/nextclade.loc b/data_managers/data_manager_nextclade/test-data/nextclade.loc
@@ -9,4 +9,4 @@
 # the database_name refers to which dataset names that, in general, align with species names. for more details see the output of "nextclade dataset list"
 #
 # for example
-#sars-cov-2_2022-06-14T12:00:00Z	sars-cov-2	SARS-CoV-2	1.10.0	2022-06-14T12:00:00	/srv/galaxy/tool-data/nextclade/sars-cov-2_2022-06-14T12-00-00Z
+#sars-cov-2_2025-05-05--13-15-29Z	sars-cov-2	SARS-CoV-2	3.0.0-alpha.0	2025-05-05T13:15:29	/srv/galaxy/tool-data/nextclade/sars-cov-2_2025-05-05--13-15-29Z
diff --git a/tools/nextclade/README.md b/tools/nextclade/README.md
@@ -0,0 +1,16 @@
+### Updating the generated_macros.xml
+
+Nextclade's output differs depending on which dataset is used for clade assignment. To deal with this, the `datasets_to_macros.py` script runs nextclade to get a list of datasets, then runs nextclade with sample data and collects the expected columns from each output dataset. This information is used to generate macros, thus:
+
+```
+# requires nextclade to be in the path
+./datasets_to_macros.py generated_macros.xml
+```
+
+This script should be run before updating the nextclade tool. Note that it handles a few special cases:
+
+1. There are two sets of Influenza datasets with the same name but different reference sequences. These are special-cased in the code to distinguish them.
+
+2. Some information about the SARS-CoV-2 and MPXV datasets is used for testing, so tokens containing information about these datasets are created specifically for that purpose.
+
+The `generated_macros.xml` now also includes the tool version, to ensure that the generated data matches the version of nextclade used to generate it.