Skip to content

Commit 8841acf

Browse files
authored
adding rule to intersect bed (#65)
* when using bedtools intersect, keep only the highest intersection of annotation * Update merge_annotations.nf * updating versioning information * change modules to use named outputs for better readability * fixed optional emissions * fix awk print statement * Update CHANGELOG.md * Update CHANGELOG.md
1 parent 620b3a5 commit 8841acf

40 files changed

+277
-343
lines changed

.zenodo.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"description": "<p>The pipeline</p>\n\n<p>bacannot, is a customisable, easy to use, pipeline that uses state-of-the-art software for comprehensively annotating prokaryotic genomes having only Docker and Nextflow as dependencies. It is able to annotate and detect virulence and resistance genes, plasmids, secondary metabolites, genomic islands, prophages, ICEs, KO, and more, while providing nice and beautiful interactive documents for results exploration.</p>",
33
"license": "other-open",
44
"title": "fmalmeida/bacannot: A generic but comprehensive bacterial annotation pipeline",
5-
"version": "v3.1.4",
5+
"version": "v3.1.5",
66
"upload_type": "software",
77
"creators": [
88
{

markdown/CHANGELOG.md

+6
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,12 @@
22

33
The tracking for changes started in v2.1
44

5+
## v3.1.5 [17-September-2022]
6+
7+
* Fixes https://github.com/fmalmeida/bacannot/issues/64 reported by @fmalmeida, which highlights that the resfinder annotations were sometimes being duplicated because a single gene had intersections with more than one CDS region, even though only one of them was "true".
8+
- To solve this issue, intersections are now sorted by length, and only the first occurrence (per gene) is kept.
9+
* As reported in issue #39, named outputs are now used in modules for better readability.
10+
511
## v3.1.4 [13-September-2022]
612

713
* Fixes https://github.com/fmalmeida/bacannot/issues/62 reported by @rujinlong, where the Island-Path tool was failing because it was running on genbank files with no true CDS. This was happening because Bakta writes in the comments that the GBK has 0 CDS and, at first, the module was selecting GBK by checking if the CDS string was there. It has now been modified to also work with Bakta.

modules/KOs/kegg-decoder.nf

+2-3
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,8 @@ process KEGG_DECODER {
77
tuple val(prefix), path('input_mapper.txt')
88

99
output:
10-
// Grab all outputs
11-
path("*") // Get all files to input directory
12-
tuple val(prefix), path("*.svg") // get svg
10+
path("*") , emit: all // Get all files to input directory
11+
tuple val(prefix), path("*.svg"), emit: results // get svg
1312

1413
script:
1514
"""

modules/KOs/kofamscan.nf

+2-2
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ process KOFAMSCAN {
1212

1313
output:
1414
// Grab all outputs
15-
file("KOfamscan")
16-
tuple val(prefix), file("KOfamscan/${prefix}_ko_forKEGGMapper.txt")
15+
path("KOfamscan"), emit: all
16+
tuple val(prefix), path("KOfamscan/${prefix}_ko_forKEGGMapper.txt"), emit: results
1717

1818
script:
1919
"""

modules/MGEs/digIS.nf

+3-4
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,9 @@ process DIGIS {
1212
tuple val(prefix), path(genome), path(genbank)
1313

1414
output:
15-
// Grab results
16-
file("digIS")
17-
tuple val(prefix), path("digIS/results/${prefix}.gff")
18-
tuple val(prefix), path("${prefix}_IS.gff"), path("digIS/results/fastas/${prefix}_IS.fa"), path("digIS/results/fastas/${prefix}_IS.faa")
15+
path("digIS") , emit: all
16+
tuple val(prefix), path("digIS/results/${prefix}.gff"), emit: gff
17+
tuple val(prefix), path("${prefix}_IS.gff"), path("digIS/results/fastas/${prefix}_IS.fa"), path("digIS/results/fastas/${prefix}_IS.faa"), emit: gff_and_sequences
1918

2019
script:
2120
"""

modules/MGEs/draw_gis.nf

+2-3
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,8 @@ process DRAW_GIS {
1111
tuple val(prefix), file(gff), file(gis_bed)
1212

1313
output:
14-
// Outputs must be linked to each prefix (tag)
15-
tuple val(prefix), file("plots") optional true
16-
tuple val(prefix), file("teste.png") optional true
14+
tuple val(prefix), file("plots") , emit: all optional true
15+
tuple val(prefix), file("teste.png"), emit: example optional true
1716

1817
script:
1918
"""

modules/MGEs/iceberg.nf

+4-5
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,10 @@ process ICEBERG {
99
file(bacannot_db)
1010

1111
output:
12-
// Outputs must be linked to each prefix (tag)
13-
tuple val(prefix), file("${prefix}_iceberg_blastp_onGenes.summary.txt")
14-
tuple val(prefix), file("${prefix}_iceberg_blastp_onGenes.txt")
15-
tuple val(prefix), file("${prefix}_iceberg_blastn_onGenome.summary.txt")
16-
file('*.txt') // Grab all
12+
tuple val(prefix), path("${prefix}_iceberg_blastp_onGenes.summary.txt") , emit: genes_summary
13+
tuple val(prefix), path("${prefix}_iceberg_blastp_onGenes.txt") , emit: results
14+
tuple val(prefix), path("${prefix}_iceberg_blastn_onGenome.summary.txt"), emit: genome_summary
15+
path('*.txt') , emit: all
1716

1817
script:
1918
"""

modules/MGEs/islandpath.nf

+1-2
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,7 @@ process ISLANDPATH {
77
tuple val(prefix), file("annotation.gbk")
88

99
output:
10-
// Outputs must be linked to each prefix (tag)
11-
tuple val(prefix), path("${prefix}_predicted_GIs.bed")
10+
tuple val(prefix), path("${prefix}_predicted_GIs.bed"), emit: results
1211

1312
script:
1413
"""

modules/MGEs/plasmidfinder.nf

+2-2
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@ process PLASMIDFINDER {
1111
file(bacannot_db)
1212

1313
output:
14-
tuple val(prefix), path("plasmidfinder")
15-
tuple val(prefix), path("plasmidfinder/results_tab.tsv")
14+
tuple val(prefix), path("plasmidfinder") , emit: all
15+
tuple val(prefix), path("plasmidfinder/results_tab.tsv"), emit: results
1616

1717
script:
1818
"""

modules/MGEs/platon.nf

+3-3
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@ process PLATON {
1212
file(bacannot_db)
1313

1414
output:
15-
path("platon")
16-
tuple val(prefix), path("platon/${prefix}.tsv")
17-
path("platon_version.txt")
15+
path("platon") , emit: all
16+
tuple val(prefix), path("platon/${prefix}.tsv"), emit: results
17+
path("platon_version.txt") , emit: version
1818

1919
script:
2020
"""

modules/assembly/flye.nf

+3-3
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,10 @@ process FLYE {
1111
tuple val(prefix), val(entrypoint), file(sread1), file(sread2), file(sreads), file(lreads), val(lr_type), file(fast5), val(assembly), val(resfinder_species)
1212

1313
output:
14-
file "flye_${prefix}" // Saves all files
14+
path "flye_${prefix}", emit: all // Saves all files
1515
// Keep tuple structure to mixing channels
16-
tuple val("${prefix}"), val("${entrypoint}"), val("${sread1}"), val("${sread2}"), val("${sreads}"), file("${lreads}"), val("${lr_type}"), file("${fast5}"), file("flye_${prefix}.fasta"), val("${resfinder_species}")
17-
file('flye_version.txt')
16+
tuple val("${prefix}"), val("${entrypoint}"), val("${sread1}"), val("${sread2}"), val("${sreads}"), path("${lreads}"), val("${lr_type}"), path("${fast5}"), path("flye_${prefix}.fasta"), val("${resfinder_species}"), emit: results
17+
path('flye_version.txt'), emit: version
1818

1919
script:
2020
lr = (lr_type == 'nanopore') ? '--nano-raw' : '--pacbio-raw'

modules/assembly/unicycler.nf

+3-3
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,10 @@ process UNICYCLER {
1111
tuple val(prefix), val(entrypoint), file(sread1), file(sread2), file(sreads), file(lreads), val(lr_type), file(fast5), val(assembly), val(resfinder_species)
1212

1313
output:
14-
file "unicycler_${prefix}" // Save everything
14+
path "unicycler_${prefix}", emit: all // Save everything
1515
// Keep tuple structure to mixing channels
16-
tuple val("${prefix}"), val("${entrypoint}"), val("${sread1}"), val("${sread2}"), val("${sreads}"), file("${lreads}"), val("${lr_type}"), file("${fast5}"), file("unicycler_${prefix}.fasta"), val("${resfinder_species}")
17-
file('unicycler_version.txt')
16+
tuple val("${prefix}"), val("${entrypoint}"), val("${sread1}"), val("${sread2}"), val("${sreads}"), path("${lreads}"), val("${lr_type}"), path("${fast5}"), path("unicycler_${prefix}.fasta"), val("${resfinder_species}"), emit: results
17+
path('unicycler_version.txt'), emit: version
1818

1919
script:
2020
unpaired_param = (sreads.getName() != "input.3") ? "-s $sreads" : ""

modules/generic/antismash.nf

+3-4
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,9 @@ process ANTISMASH {
1111
file(bacannot_db)
1212

1313
output:
14-
// Grab results
15-
tuple val(prefix), path("antiSMASH/regions.gff")
16-
path("antiSMASH")
17-
path("*_version.txt")
14+
tuple val(prefix), path("antiSMASH/regions.gff"), emit: gff
15+
path("antiSMASH") , emit: all
16+
path("*_version.txt") , emit: version
1817

1918
script:
2019
def gbk_suffix = (params.bakta_db) ? "gbff" : "gbk"

modules/generic/bakta.nf

+10-10
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,17 @@ process BAKTA {
1313

1414
output:
1515
// Grab all outputs
16-
file "annotation"
16+
path "annotation", emit: all
1717
// Outputs must be linked to each prefix (tag)
18-
tuple val(prefix), file("annotation/${prefix}.gff3") // annotation in gff format
19-
tuple val(prefix), file("annotation/${prefix}.gbff") // annotation in gbk format
20-
tuple val(prefix), file("annotation/${prefix}.fna") // renamed genome
21-
tuple val(prefix), file("annotation/${prefix}.faa") // gene aa sequences
22-
tuple val(prefix), file("annotation/${prefix}.ffn") // gene nt sequences
23-
tuple val(prefix), file("annotation/${prefix}.fna"), file("${lreads}"), file("${fast5}") // For methylation calling
24-
tuple val(prefix), file("annotation/${prefix}.fna"), val("${resfinder_species}") // For resfinder
25-
tuple val(prefix), file("annotation/${prefix}.txt") // bakta stats
26-
file('bakta_version.txt') // Save bakta version
18+
tuple val(prefix), path("annotation/${prefix}.gff3"), emit: gff // annotation in gff format
19+
tuple val(prefix), path("annotation/${prefix}.gbff"), emit: gbk // annotation in gbk format
20+
tuple val(prefix), path("annotation/${prefix}.fna") , emit: genome // renamed genome
21+
tuple val(prefix), path("annotation/${prefix}.faa") , emit: proteins // gene aa sequences
22+
tuple val(prefix), path("annotation/${prefix}.ffn") , emit: genes // gene nt sequences
23+
tuple val(prefix), path("annotation/${prefix}.fna"), path("${lreads}"), path("${fast5}"), emit: genome_with_fast5 // For methylation calling
24+
tuple val(prefix), path("annotation/${prefix}.fna"), val("${resfinder_species}"), emit: genome_with_species // For resfinder
25+
tuple val(prefix), path("annotation/${prefix}.txt") , emit: summary // bakta stats
26+
path('bakta_version.txt'), emit: version // Save bakta version
2727

2828
script:
2929
"""

modules/generic/barrnap.nf

+3-3
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@ process BARRNAP {
1010
tuple val(prefix), file(genome)
1111

1212
output:
13-
tuple val(prefix), path("${prefix}_rRNA.gff")
14-
tuple val(prefix), path("${prefix}_rRNA.fa")
15-
path('barrnap_version.txt')
13+
tuple val(prefix), path("${prefix}_rRNA.gff"), emit: gff
14+
tuple val(prefix), path("${prefix}_rRNA.fa") , emit: fasta
15+
path('barrnap_version.txt') , emit: version
1616

1717
script:
1818
"""

modules/generic/compute_gc.nf

+1-2
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,7 @@ process COMPUTE_GC {
66
tuple val(prefix), file(genome)
77

88
output:
9-
// Outputs must be linked to each prefix (tag)
10-
tuple val(prefix), path("input_GC_500_bps.sorted.bedGraph"), path("input.sizes")
9+
tuple val(prefix), path("input_GC_500_bps.sorted.bedGraph"), path("input.sizes"), emit: results
1110

1211
script:
1312
"""

modules/generic/custom_database.nf

+4-5
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,10 @@ process CUSTOM_DATABASE {
88
each file(customDB)
99

1010
output:
11-
// Outputs must be linked to each prefix (tag)
12-
tuple val(prefix), val("${customDB.baseName}"), path("${prefix}_${customDB.baseName}*.summary.txt")
13-
tuple val(prefix), path("${customDB.baseName}_custom_db.gff")
14-
path('*.txt') // Grab all
15-
path(customDB)
11+
tuple val(prefix), val("${customDB.baseName}"), path("${prefix}_${customDB.baseName}*.summary.txt"), emit: summary
12+
tuple val(prefix), path("${customDB.baseName}_custom_db.gff") , emit: gff
13+
path('*.txt') , emit: all
14+
path(customDB) , emit: db
1615

1716
script:
1817
"""

modules/generic/custom_database_report.nf

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ process CUSTOM_DATABASE_REPORT {
1010
tuple val(prefix), val(customDB), file(custom_blast), file(custom_gff)
1111

1212
output:
13-
file '*.html'
13+
path '*.html', emit: results
1414

1515
script:
1616
"""

modules/generic/gff2gbk.nf

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ process GFF2GBK {
77
tuple val(prefix), file(gff), file(input)
88

99
output:
10-
file "*.genbank"
10+
path "*.genbank", emit: results
1111

1212
"""
1313
# Activate env

modules/generic/gff2sql.nf

+2-2
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@ process CREATE_SQL {
1010
tuple val(prefix), file(gff), file(genes_nt), file(genes_aa), file(genome), file("digIS.gff"), file("digIS.fa"), file("digIS.faa")
1111

1212
output:
13-
file "${prefix}.sqlite"
14-
file "run_server.sh"
13+
path "${prefix}.sqlite", emit: results
14+
path "run_server.sh" , emit: script
1515

1616
script:
1717
"""

modules/generic/jbrowse.nf

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ process JBROWSE {
77
tuple val(prefix), file(merged_gff), file(draft), file("prokka_gff"), file(barrnap), file(gc_bedGraph), file(gc_chrSizes), file(resfinder_gff), file(phigaro), file(genomic_islands), file("methylation"), file("chr.sizes"), file(phispy_tsv), file(digIS_gff), file(antiSMASH), file(custom_annotations)
88

99
output:
10-
file "*"
10+
path "*", emit: results
1111

1212
script:
1313
"""

modules/generic/mash.nf

+2-3
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,8 @@ process REFSEQ_MASHER {
1010
tuple val(prefix), path(genome)
1111

1212
output:
13-
// Grab results
14-
tuple val(prefix), path("refseq_masher_results.txt")
15-
path("*_version.txt")
13+
tuple val(prefix), path("refseq_masher_results.txt"), emit: results
14+
path("*_version.txt") , emit: version
1615

1716
script:
1817
"""

modules/generic/merge_annotations.nf

+6-6
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,10 @@ process MERGE_ANNOTATIONS {
77
tuple val(prefix), file('prokka_gff'), file(kofamscan), file(vfdb), file(victors), file(amrfinder), file(resfinder), file(rgi), file(iceberg), file(phast), file('digis_gff'), file(custom_databases)
88

99
output:
10-
tuple val(prefix), path("${prefix}.gff")
11-
tuple val(prefix), path("transposable_elements_digis.gff")
12-
tuple val(prefix), path("custom_database_*.gff") optional true
13-
path("*.gff")
10+
tuple val(prefix), path("${prefix}.gff") , emit: gff
11+
tuple val(prefix), path("transposable_elements_digis.gff"), emit: digis_gff
12+
tuple val(prefix), path("custom_database_*.gff") , emit: customdb_gff optional true
13+
path("*.gff") , emit: all
1414

1515
script:
1616
"""
@@ -81,7 +81,7 @@ process MERGE_ANNOTATIONS {
8181
#### Resfinder
8282
if [ ! \$(cat $resfinder | wc -l) -eq 0 ]
8383
then
84-
bedtools intersect -a $resfinder -b ${prefix}.gff -wo > resfinder_intersected.txt ;
84+
bedtools intersect -a $resfinder -b ${prefix}.gff -wo | sort -k19,19 -r | awk -F '\\t' '!seen[\$9]++' > resfinder_intersected.txt ;
8585
addBedtoolsIntersect.R -g ${prefix}.gff -t resfinder_intersected.txt --type Resistance --source Resfinder -o ${prefix}.gff ;
8686
grep "Resfinder" ${prefix}.gff > resistance_resfinder.gff ;
8787
rm -f resfinder_intersected.txt ;
@@ -93,7 +93,7 @@ process MERGE_ANNOTATIONS {
9393
if [ ! \$(cat \$file | wc -l) -eq 0 ]
9494
then
9595
db=\${file%%_custom_db.gff} ;
96-
bedtools intersect -a \${file} -b ${prefix}.gff -wo > bedtools_intersected.txt ;
96+
bedtools intersect -a \${file} -b ${prefix}.gff -wo | sort -k19,19 -r | awk -F '\\t' '!seen[\$9]++' > bedtools_intersected.txt ;
9797
addBedtoolsIntersect.R -g ${prefix}.gff -t bedtools_intersected.txt --type "CDS" --source "\${db}" -o ${prefix}.gff ;
9898
grep "\${db}" ${prefix}.gff > custom_database_\${db}.gff ;
9999
rm -f bedtools_intersected.txt ;

modules/generic/methylation.nf

+5-6
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,11 @@ process CALL_METHYLATION {
1010
tuple val(prefix), file(draft), file(reads), file(fast5)
1111

1212
output:
13-
// Grab all outputs
14-
file "*_calls.tsv" optional true
15-
file "*_frequency.tsv" optional true
16-
tuple val(prefix), file("methylation_frequency.bedGraph") optional true
17-
tuple val(prefix), file("chr.sizes") optional true
18-
file('nanopolish_version.txt')
13+
path "*_calls.tsv" , emit: results optional true
14+
path "*_frequency.tsv" , emit: frequencies optional true
15+
tuple val(prefix), path("methylation_frequency.bedGraph"), emit: bedgraph optional true
16+
tuple val(prefix), path("chr.sizes") , emit: chr_sizes optional true
17+
path('nanopolish_version.txt') , emit: version
1918

2019
when:
2120
// When an entry does not exist, it is created as 'input'

modules/generic/mlst.nf

+3-3
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@ process MLST {
1111
file(bacannot_db)
1212

1313
output:
14-
tuple val(prefix), path("${prefix}_mlst_analysis.txt") optional true
15-
tuple val(prefix), path("${prefix}_novel_alleles.fasta") optional true
16-
path('mlst_version.txt')
14+
tuple val(prefix), path("${prefix}_mlst_analysis.txt") , emit: results optional true
15+
tuple val(prefix), path("${prefix}_novel_alleles.fasta"), emit: alleles optional true
16+
path('mlst_version.txt') , emit: version
1717

1818
script:
1919
"""

modules/generic/ncbi_protein.nf

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ process GET_NCBI_PROTEIN {
55
file(ncbi_accs)
66

77
output:
8-
path("ncbi_protein.faa")
8+
path("ncbi_protein.faa"), emit: proteins
99

1010
script:
1111
"""

modules/generic/prokka.nf

+10-10
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,17 @@ process PROKKA {
1313

1414
output:
1515
// Grab all outputs
16-
path("annotation")
16+
path("annotation"), emit: all
1717
// Outputs must be linked to each prefix (tag)
18-
tuple val(prefix), path("annotation/${prefix}.gff")
19-
tuple val(prefix), path("annotation/${prefix}.gbk")
20-
tuple val(prefix), path("annotation/${prefix}.fna")
21-
tuple val(prefix), path("annotation/${prefix}.faa")
22-
tuple val(prefix), path("annotation/${prefix}.ffn")
23-
tuple val(prefix), path("annotation/${prefix}.fna"), path("${lreads}"), path("${fast5}")
24-
tuple val(prefix), path("annotation/${prefix}.fna"), val("${resfinder_species}")
25-
tuple val(prefix), path("annotation/${prefix}.txt")
26-
path('prokka_version.txt')
18+
tuple val(prefix), path("annotation/${prefix}.gff"), emit: gff
19+
tuple val(prefix), path("annotation/${prefix}.gbk"), emit: gbk
20+
tuple val(prefix), path("annotation/${prefix}.fna"), emit: genome
21+
tuple val(prefix), path("annotation/${prefix}.faa"), emit: proteins
22+
tuple val(prefix), path("annotation/${prefix}.ffn"), emit: genes
23+
tuple val(prefix), path("annotation/${prefix}.fna"), path("${lreads}"), path("${fast5}"), emit: genome_with_fast5
24+
tuple val(prefix), path("annotation/${prefix}.fna"), val("${resfinder_species}"), emit: genome_with_species
25+
tuple val(prefix), path("annotation/${prefix}.txt"), emit: summary
26+
path('prokka_version.txt'), emit: version
2727

2828
script:
2929
kingdom = (params.prokka_kingdom) ? "--kingdom ${params.prokka_kingdom}" : ''

modules/generic/reports.nf

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ process REPORT {
77
tuple val(prefix), file('annotation_stats.tsv'), file(gff), file(barrnap), file(mlst), file(keggsvg), file(refseq_masher_txt), file(amrfinder), file(rgi), file(rgi_parsed), file(rgi_heatmap), file(argminer_out), file(resfinder_tab), file(resfinder_point), file(resfinder_phenotable), file(vfdb_blastn), file(victors_blastp), file(phigaro_txt), file(phispy_tsv), file(iceberg_blastp), file(iceberg_blastn), file(plasmids_tsv), file(platon_tsv), file(gi_image), file(phast_blastp), file(digIS)
88

99
output:
10-
file '*.html'
10+
path '*.html', emit: results
1111

1212
script:
1313
def generic_annotator = (params.bakta_db) ? "bakta" : "prokka"

0 commit comments

Comments
 (0)