Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
eeaeb96
Update nextclade to 3.15.3
pvanheus Jul 21, 2025
cee48cb
Fix include-reference and MPXV_COLUMNS
pvanheus Jul 21, 2025
9633370
Merge branch 'main' of https://github.com/galaxyproject/tools-iuc int…
pvanheus Jul 21, 2025
2471d02
Move version to generated_macros.xml, add README.md
pvanheus Jul 21, 2025
e465cb3
Removed unused import
pvanheus Jul 21, 2025
44fd9d0
Fix flake8 error - blank lines
pvanheus Jul 21, 2025
7dd5acd
Remove line break
pvanheus Jul 21, 2025
7f7187d
Add support for nextclade datasets provided as datasets
pvanheus Jul 21, 2025
51f78ce
Update version command
pvanheus Jul 21, 2025
32d7cd1
Add yellow fever virus test data
pvanheus Jul 21, 2025
b3fbbaf
Add nextclade_dataset_get tool
pvanheus Jul 21, 2025
3c7a33f
Add nextclade link
pvanheus Jul 21, 2025
f15b0c9
Add check to see if nextclade is in PATH
pvanheus Jul 21, 2025
51d43bc
Update compatibility spec to 3.0.0
pvanheus Jul 21, 2025
30d57a5
Correct regexp
pvanheus Jul 21, 2025
3b08959
Escape periods
pvanheus Jul 21, 2025
314b3de
Update nextclade datamanager for nextclade version 3
pvanheus Jul 22, 2025
92812cb
Add link to generated_macros.xml, reformat python
pvanheus Jul 22, 2025
e3719c7
Add homepage_url
pvanheus Jul 22, 2025
26f200f
Set profile 24.0
pvanheus Jul 22, 2025
014246c
Attempt to use galaxy.json to set columns names
pvanheus Jul 24, 2025
17c1d50
Fix dynamic generation of column header metadata
pvanheus Jul 24, 2025
9d6dab0
Fix flake8 problem
pvanheus Jul 24, 2025
53fb368
Update to Nextclade version 3.16.0
pvanheus Aug 19, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions data_managers/data_manager_nextclade/.shed.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ long_description: |
name: data_manager_nextclade
owner: iuc
remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_nextclade
homepage_url: https://github.com/nextstrain/nextclade
type: unrestricted
81 changes: 52 additions & 29 deletions data_managers/data_manager_nextclade/data_manager/nextclade_dm.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,14 @@
from typing import List


def get_nextclade_version() -> str:
    """Return the version string reported by the nextclade CLI tool.

    Runs ``nextclade --version`` and returns the second whitespace-separated
    token of its standard output, e.g. ``"2.3.0"`` for ``"nextclade 2.3.0"``.

    Raises:
        FileNotFoundError: if no ``nextclade`` executable is found on PATH.
        subprocess.CalledProcessError: if the command exits with a non-zero
            status.
        ValueError: if the output does not contain a version token.
    """
    version_cmd = ["nextclade", "--version"]
    version_proc = subprocess.run(version_cmd, capture_output=True, check=True)
    version_output = version_proc.stdout.decode("utf-8").strip()
    fields = version_output.split()
    if len(fields) < 2:
        # Fail with a message that names the culprit instead of a bare IndexError.
        raise ValueError(
            f"Unexpected output from 'nextclade --version': {version_output!r}"
        )
    return fields[1]


def parse_date(d: str) -> datetime.datetime:
# Parses the publication date from the nextclade release tags or user input into a datetime object.
date = None
Expand All @@ -20,46 +28,57 @@ def parse_date(d: str) -> datetime.datetime:
return date


def entry_to_tag(name: str, tag: str) -> str:
    """Build the data table ``value`` for a dataset version.

    Args:
        name: the dataset name.
        tag: the version tag of the dataset.

    Returns:
        ``name`` and ``tag`` joined by a single underscore.
    """
    return "_".join([name, tag])


def get_database_list(default_compatibility: str) -> List[dict]:
    """List every version of every dataset reported by the nextclade CLI.

    Runs ``nextclade dataset list --json`` and flattens the result into one
    entry per dataset version.

    Args:
        default_compatibility: value recorded as ``min_nextclade_version``
            for dataset versions that have no ``compatibility`` key.

    Returns:
        A list of dicts with the keys ``value``, ``database_name``,
        ``description``, ``date`` (a naive ``datetime.datetime``), ``tag``
        and ``min_nextclade_version``.

    Raises:
        FileNotFoundError: if no ``nextclade`` executable is found on PATH.
        subprocess.CalledProcessError: if the command exits with a non-zero
            status.
    """
    list_cmd = ["nextclade", "dataset", "list", "--json"]
    list_proc = subprocess.run(list_cmd, capture_output=True, check=True)
    database_list = json.loads(list_proc.stdout)
    entry_list = []
    for db_entry in database_list:
        path = db_entry["path"]
        # Prefer the first shortcut as the dataset name; fall back to the full
        # path when there is no shortcut (key missing or list empty).
        shortcuts = db_entry.get("shortcuts")
        name = shortcuts[0] if shortcuts else path
        description = db_entry["attributes"]["name"]
        # Two pairs of influenza datasets share a display name; mark the ones
        # whose path contains these reference identifiers so they can be told
        # apart.
        if (
            "CY121680" in path and description == "Influenza A H1N1pdm HA"
        ) or ("CY163680" in path and description == "Influenza A H3N2 HA"):
            description += " (broad)"
        if name.startswith("community/"):
            description += " (community contributed)"

        for version in db_entry["versions"]:
            # Drop the trailing "Z" so the parsed timestamp is a naive datetime.
            version_date = datetime.datetime.fromisoformat(
                version["updatedAt"].replace("Z", "")
            )
            tag = version["tag"]
            if "compatibility" in version:
                min_version = version["compatibility"]["cli"]
            else:
                min_version = default_compatibility
            entry_list.append(
                {
                    "value": entry_to_tag(name, tag),
                    "database_name": name,
                    "description": description,
                    "date": version_date,
                    "tag": tag,
                    "min_nextclade_version": min_version,
                }
            )

    return entry_list


def filter_by_date(
existing_release_tags: List[str],
existing_release_tags: set[str],
name: str,
releases: list,
start_date: datetime.datetime = None,
end_date: datetime.datetime = None,
start_date: datetime.datetime | None = None,
end_date: datetime.datetime | None = None,
) -> List[dict]:
ret = []
for release in releases:
Expand Down Expand Up @@ -108,6 +127,7 @@ def comma_split(args: str) -> List[str]:
parser.add_argument("--end_date", type=parse_date)
parser.add_argument("--known_revisions", type=comma_split)
parser.add_argument("--datasets", type=comma_split, default=["sars-cov-2"])

parser.add_argument("datatable_name", default="nextclade")
parser.add_argument("galaxy_config")
args = parser.parse_args()
Expand All @@ -118,13 +138,16 @@ def comma_split(args: str) -> List[str]:
else:
existing_release_tags = set()

releases_available = get_database_list()
nextclade_version = get_nextclade_version()
major_version = nextclade_version.split(".")[0]
default_compatibility = f"{major_version}.0.0"
releases_available = get_database_list(default_compatibility)
if args.testmode:
releases = []
for name in args.datasets:
releases.extend(
filter_by_date(
[],
set(),
name,
releases_available,
start_date=args.start_date,
Expand Down
34 changes: 15 additions & 19 deletions data_managers/data_manager_nextclade/data_manager/nextclade_dm.xml
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
<tool id="data_manager_nextclade" name="nextclade data manager" version="0.0.1+galaxy0" tool_type="manage_data" profile="20.01">

<tool id="data_manager_nextclade" name="nextclade data manager" version="0.0.2+galaxy0" tool_type="manage_data" profile="24.0">
<macros>
<import>generated_macros.xml</import>
</macros>
<requirements>
<requirement type="package" version="3.8">python</requirement>
<requirement type="package" version="2.3.0">nextclade</requirement>
<requirement type="package" version="3.12">python</requirement>
<requirement type="package" version="@TOOL_VERSION@">nextclade</requirement>
</requirements>
<command detect_errors="exit_code"><![CDATA[
#set $data_table = $__app__.tool_data_tables.get('nextclade')
Expand Down Expand Up @@ -38,15 +40,7 @@
]]></command>
<inputs>
<param name="datasets" type="select" label="Select nextclade datasets" multiple="true">
<option value="sars-cov-2" selected="true">SARS-CoV-2</option>
<option value="MPXV">Monkeypox (All Clades)</option>
<option value="hMPXV">Human Monkeypox (hMPXV)</option>
<option value="hMPXV_B1">Human Monkeypox Clade B.1</option>
<option value="flu_h1n1pdm_ha">Influenza A H1N1pdm HA</option>
<option value="flu_h3n2_ha">Influenza A H3N2 HA</option>
<option value="flu_vic_ha">Influenza B Victoria HA</option>
<option value="flu_yam_ha">Influenza B Yamagata HA</option>
<option value="sars-cov-2-no-recomb">SARS-CoV-2 without recombinants</option>
<expand macro="dataset_selector" />
</param>
<param name="additional_datasets" type="text" label="Additional nextclade dataset names" help="If you want to download datasets that are not in the list above, enter their names here, separated by commas">
<validator type="regex" message="Dataset names consist of letters, numbers, underscore and hyphens, with multiple names separated by ,">^[-A-Za-z0-9_]?[-A-Za-z0-9_,]*$</validator>
Expand All @@ -72,22 +66,24 @@
<data name="output_file" format="data_manager_json"/>
</outputs>
<tests>
<!-- test1 -->
<test expect_num_outputs="1">
<conditional name="release">
<param name="which" value="date_range" />
<param name="start_date" value="2022-03-01" />
<param name="end_date" value="2022-04-01" />
<param name="start_date" value="2025-03-04" />
<param name="end_date" value="2025-05-06" />
</conditional>
<output name="output_file">
<assert_contents>
<has_text text='"database_name": "sars-cov-2"' />
<has_text text='sars-cov-2_2022-03-31T12-00-00Z' />
<has_text text='sars-cov-2_2022-03-24T12-00-00Z' />
<has_text text='sars-cov-2_2022-03-14T12-00-00Z"' />
<has_text text='"min_nextclade_version": "1.10.0"' />
<has_text text='sars-cov-2_2025-03-26--11-47-13Z' />
<has_text text='sars-cov-2_2025-04-01--08-20-12Z' />
<has_text text='sars-cov-2_2025-05-05--13-15-29Z' />
<has_text text='"min_nextclade_version": "3.0.0' />
</assert_contents>
</output>
</test>
<!-- test2 -->
<test expect_num_outputs="1">
<param name="datasets" value="MPXV,hMPXV" />
<conditional name="release">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@
# the database_name is the dataset name, which, in general, aligns with the species name. For more details see the output of "nextclade dataset list"
#
# for example
#sars-cov-2_2022-06-14T12:00:00Z sars-cov-2 SARS-CoV-2 1.10.0 2022-06-14T12:00:00 /srv/galaxy/tool-data/nextclade/sars-cov-2_2022-06-14T12-00-00Z
#sars-cov-2_2025-05-05--13-15-29Z sars-cov-2 SARS-CoV-2 3.0.0-alpha.0 2025-05-05T13:15:29 /tmp/tmpildayi8q/galaxy-dev/tool-data/nextclade/sars-cov-2_2025-05-05--13-15-29Z
16 changes: 16 additions & 0 deletions tools/nextclade/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
### Updating the generated_macros.xml

Nextclade output differs depending on which dataset is used for clade assignment. To deal with this, the `datasets_to_macros.py` script runs nextclade to get a list of datasets and then runs nextclade with sample data and collects the expected columns from each output dataset. This information is used to generate macros, thus:

```
# requires nextclade to be in the path
./datasets_to_macros.py generated_macros.xml
```

This script should be run before updating the nextclade tool. Note that there are a few special cases in here:

1. There are two sets of Influenza datasets with the same name, but different reference sequences. These are special cased in the code to distinguish them.

2. Some information about the SARS-CoV-2 and MPXV datasets is used for testing, thus tokens are created specifically containing info about these datasets.

The `generated_macros.xml` now also includes the tool version, to ensure that the generated data matches the version of nextclade used to generate it.
Loading