diff --git a/.gitignore b/.gitignore
index e75900d..961b31c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,5 @@
 .nextflow*
 .nf-test*
+__pycache__/
+*.pyc
+.python-version
\ No newline at end of file
diff --git a/modules/ensembl/fasta/recombine/environment.yml b/modules/ensembl/fasta/recombine/environment.yml
new file mode 100644
index 0000000..94089f3
--- /dev/null
+++ b/modules/ensembl/fasta/recombine/environment.yml
@@ -0,0 +1,6 @@
+---
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - ensembl-genomio=1.6.1
diff --git a/modules/ensembl/fasta/recombine/main.nf b/modules/ensembl/fasta/recombine/main.nf
new file mode 100644
index 0000000..d14d71b
--- /dev/null
+++ b/modules/ensembl/fasta/recombine/main.nf
@@ -0,0 +1,71 @@
+// See the NOTICE file distributed with this work for additional information
+// regarding copyright ownership.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Recombine the split FASTA files listed in `fasta_manifest` into `<meta.id>.fa`
+// by running `fasta_recombine`. Optional flags come from `params`; staged paths
+// are single-quoted so unusual file names reach the tool as one argument each.
+process FASTA_RECOMBINE {
+
+    tag "${meta.id}"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "${workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container
+        ? 'https://depot.galaxyproject.org/singularity/ensembl-genomio:1.6.1--pyhdfd78af_0'
+        : 'quay.io/biocontainers/ensembl-genomio:1.6.1--pyhdfd78af_0'}"
+
+    input:
+    tuple val(meta), path(fasta_manifest), path(agp)
+
+    output:
+    tuple val(meta), path("${meta.id}.fa"), emit: recombined_fasta
+    tuple val("${task.process}"), val('fasta_recombine'), eval("fasta_recombine --version"), emit: versions_fasta_recombine, topic: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = []
+
+    if (params.chunk_id_regex) {
+        // Close the quote, emit a double-quoted ', reopen: keeps the regex intact inside '...'.
+        def rx = params.chunk_id_regex.replace("'", "'\"'\"'")
+        args << "--chunk-id-regex '${rx}'"
+    }
+
+    if (params.allow_revcomp) {
+        args << "--allow-revcomp"
+    }
+
+    // A staged file whose base name is NO_FILE is the "no AGP supplied" placeholder.
+    def has_agp = agp && agp.baseName != 'NO_FILE'
+    if (has_agp) {
+        args << "--agp-file '${agp}'"
+    }
+
+    def out_fasta = "${meta.id}.fa"
+    """
+    fasta_recombine \\
+        --fasta-manifest '${fasta_manifest}' \\
+        --out-fasta '${out_fasta}' \\
+        ${args.join(' ')}
+    """
+
+    stub:
+    """
+    out_fa="${meta.id}.fa"
+    touch "\$out_fa"
+    """
+}
diff --git a/modules/ensembl/fasta/recombine/meta.yml b/modules/ensembl/fasta/recombine/meta.yml
new file mode 100644
index 0000000..7c7aec4
--- /dev/null
+++ b/modules/ensembl/fasta/recombine/meta.yml
@@ -0,0 +1,71 @@
+name: "fasta_recombine"
+description: Recombine split FASTA sequences into a single FASTA file,
+  optionally using an AGP file.
+keywords:
+  - ensembl
+  - fasta
+  - genomics
+  - genomio
+  - recombine
+tools:
+  - "fasta_recombine":
+      description: "Recombine split FASTA sequences generated by ensembl-genomio."
+      homepage: "https://github.com/Ensembl/ensembl-genomio"
+      licence:
+        - "Apache License version 2.0"
+      identifier: ""
+input:
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing meta information
+          e.g. `[ id:'accession1' ]`
+    - fasta_manifest:
+        type: file
+        description: Manifest file listing split FASTA files to recombine.
+ pattern: "*.txt" + ontologies: [] + - agp: + type: file + description: + Optional AGP file describing how split sequence chunks should + be recombined. Use NO_FILE when not required. + pattern: "*.{agp,NO_FILE}" + ontologies: [] +output: + recombined_fasta: + - - meta: + type: map + description: | + Groovy Map containing meta information + e.g. `[ id:'accession1' ]` + - ${meta.id}.fa: + type: file + description: Recombined FASTA file. + pattern: "*.fa" + ontologies: [] + versions_fasta_recombine: + - - ${task.process}: + type: string + description: The name of the process. + - fasta_recombine: + type: string + description: The name of the tool. + - ? fasta_recombine --version + : type: eval + description: The expression to obtain the version of the tool +topics: + versions: + - - ${task.process}: + type: string + description: The name of the process. + - fasta_recombine: + type: string + description: The name of the tool. + - ? fasta_recombine --version + : type: eval + description: The expression to obtain the version of the tool +authors: + - "ensembl-dev@ebi.ac.uk" +maintainers: + - "ensembl-dev@ebi.ac.uk" diff --git a/modules/ensembl/fasta/recombine/tests/main.nf.test b/modules/ensembl/fasta/recombine/tests/main.nf.test new file mode 100644 index 0000000..4448cf7 --- /dev/null +++ b/modules/ensembl/fasta/recombine/tests/main.nf.test @@ -0,0 +1,86 @@ +// See the NOTICE file distributed with this work for additional information +// regarding copyright ownership. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// nf-core modules test fasta/recombine
+nextflow_process {
+
+    name "Test Process FASTA_RECOMBINE"
+    script "../main.nf"
+    process "FASTA_RECOMBINE"
+
+    tag "modules"
+    tag "modules_ensembl"
+    tag "fasta"
+    tag "fasta/recombine"
+
+    test("stub outputs: header mode") { // the NO_FILE placeholder is staged in place of an AGP
+
+        when {
+            options "-stub"
+
+            process {
+                """
+                def agp_placeholder = file("NO_FILE")
+                agp_placeholder.text = ""
+
+                def manifest_file = file("manifest.txt")
+                manifest_file.text = "x\\n"
+
+                input[0] = [
+                    [ id: 'test' ],
+                    manifest_file,
+                    agp_placeholder
+                ]
+                """
+            }
+        }
+
+        then {
+            assert process.trace.tasks().size() == 1
+            assert process.out.recombined_fasta.size() == 1
+            assert process.success
+            assert snapshot(process.out).match()
+        }
+    }
+
+    test("stub outputs: AGP mode") { // an (empty) test.agp is staged as the AGP input
+
+        when {
+            options "-stub"
+
+            process {
+                """
+                def agp_file = file("test.agp")
+                agp_file.text = ""
+
+                def manifest_file = file("manifest.txt")
+                manifest_file.text = "x\\n"
+                input[0] = [
+                    [ id: 'test' ],
+                    manifest_file,
+                    agp_file
+                ]
+                """
+            }
+        }
+
+        then {
+            assert process.trace.tasks().size() == 1
+            assert process.out.recombined_fasta.size() == 1
+            assert process.success
+            assert snapshot(process.out).match()
+        }
+    }
+}
diff --git a/modules/ensembl/fasta/recombine/tests/main.nf.test.snap b/modules/ensembl/fasta/recombine/tests/main.nf.test.snap
new file mode 100644
index 0000000..0357999
--- /dev/null
+++ b/modules/ensembl/fasta/recombine/tests/main.nf.test.snap
@@ -0,0 +1,84 @@
+{
+    "stub outputs: AGP mode": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.fa:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "1": [
+                    [
+                        "FASTA_RECOMBINE",
+                        "fasta_recombine",
+                        "1.6.3"
+                    ]
+                ],
+                "recombined_fasta": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.fa:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "versions_fasta_recombine": [
+                    [
+                        "FASTA_RECOMBINE",
+                        "fasta_recombine",
+                        "1.6.3"
+                    ]
+                ]
+            }
+        ],
+        "timestamp": "2026-05-14T14:39:11.350698",
+        "meta": {
"nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, + "stub outputs: header mode": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.fa:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + "FASTA_RECOMBINE", + "fasta_recombine", + "1.6.3" + ] + ], + "recombined_fasta": [ + [ + { + "id": "test" + }, + "test.fa:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_fasta_recombine": [ + [ + "FASTA_RECOMBINE", + "fasta_recombine", + "1.6.3" + ] + ] + } + ], + "timestamp": "2026-05-14T14:39:09.216174", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + } +} \ No newline at end of file diff --git a/modules/ensembl/fasta/split/environment.yml b/modules/ensembl/fasta/split/environment.yml new file mode 100644 index 0000000..94089f3 --- /dev/null +++ b/modules/ensembl/fasta/split/environment.yml @@ -0,0 +1,6 @@ +--- +channels: + - conda-forge + - bioconda +dependencies: + - ensembl-genomio=1.6.1 diff --git a/modules/ensembl/fasta/split/main.nf b/modules/ensembl/fasta/split/main.nf new file mode 100644 index 0000000..da643c7 --- /dev/null +++ b/modules/ensembl/fasta/split/main.nf @@ -0,0 +1,115 @@ +// See the NOTICE file distributed with this work for additional information +// regarding copyright ownership. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+// Split one FASTA into chunk files under `splits/` by running `fasta_split`.
+// Each CLI option is added only when its pipeline `params` entry is set.
+process FASTA_SPLIT {
+    tag "${meta.id}"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "${workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container
+        ? 'https://depot.galaxyproject.org/singularity/ensembl-genomio:1.6.1--pyhdfd78af_0'
+        : 'quay.io/biocontainers/ensembl-genomio:1.6.1--pyhdfd78af_0'}"
+
+    input:
+    tuple val(meta), path(fasta), val(longest_seq_bp) // longest_seq_bp is not referenced by the script or stub below
+
+    output:
+    tuple val(meta), path("splits/**/*.fa"), emit: fastas
+    tuple val(meta), path("splits/*.agp"), emit: agp, optional: true
+    tuple val("${task.process}"), val('fasta_split'), eval("fasta_split --version"), emit: versions_fasta_split, topic: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = []
+
+    if (params.max_seqs_per_file) {
+        args << "--max-seqs-per-file ${params.max_seqs_per_file}"
+    }
+
+    if (params.max_seq_length_per_file) {
+        args << "--max-seq-length-per-file ${params.max_seq_length_per_file}"
+    }
+
+    if (params.min_chunk_length) {
+        args << "--min-chunk-length ${params.min_chunk_length}"
+    }
+
+    if (params.max_files_per_directory) {
+        args << "--max-files-per-directory ${params.max_files_per_directory}"
+    }
+
+    if (params.max_dirs_per_directory) {
+        args << "--max-dirs-per-directory ${params.max_dirs_per_directory}"
+    }
+
+    if (params.force_max_seq_length) {
+        args << "--force-max-seq-length"
+    }
+
+    if (params.write_agp) {
+        args << "--write-agp"
+    }
+
+    if (params.unique_file_names) {
+        args << "--unique-file-names"
+    }
+
+    if (params.delete_existing_files) {
+        args << "--delete-existing-files"
+    }
+
+    """
+    fasta_split \\
+        --fasta-file '${fasta}' \\
+        --out-dir splits \\
+        ${args.join(' ')}
+    """
+
+    stub:
+    """
+    # Fixture layout: unique file names win over nested directories, else the default layout.
+    layout="default"
+    if [[ "${params.unique_file_names ?: false}" == "true" ]]; then
+        layout="unique"
+    elif [[ -n "${params.max_dirs_per_directory ?: ''}" || -n "${params.max_files_per_directory ?: ''}" ]]; then
+        layout="multi_dir"
+    fi
+
+    mkdir -p splits
+
+    if [[ "\$layout" == "default" ]]; then
+        mkdir -p splits/0
+        touch "splits/0/${meta.id}.1.fa"
+        touch "splits/0/${meta.id}.2.fa"
+    elif [[ "\$layout" == "unique" ]]; then
+        mkdir -p splits/0
+        touch "splits/0/${meta.id}.0.1.fa"
+        touch "splits/0/${meta.id}.0.2.fa"
+    elif [[ "\$layout" == "multi_dir" ]]; then
+        mkdir -p splits/0/0
+        mkdir -p splits/0/1
+        touch "splits/0/0/${meta.id}.1.fa"
+        touch "splits/0/1/${meta.id}.2.fa"
+    fi
+
+    if [[ "${params.write_agp ?: false}" == "true" ]]; then
+        touch "splits/${meta.id}.agp"
+    fi
+    """
+}
diff --git a/modules/ensembl/fasta/split/meta.yml b/modules/ensembl/fasta/split/meta.yml
new file mode 100644
index 0000000..a00fcba
--- /dev/null
+++ b/modules/ensembl/fasta/split/meta.yml
@@ -0,0 +1,79 @@
+name: "fasta_split"
+description: Split a FASTA file into smaller FASTA files and optionally write an
+  AGP file.
+keywords:
+  - ensembl
+  - fasta
+  - genomics
+  - genomio
+  - split
+tools:
+  - "fasta_split":
+      description: "Split FASTA files with ensembl-genomio."
+      homepage: "https://github.com/Ensembl/ensembl-genomio"
+      licence:
+        - "Apache License version 2.0"
+      identifier: ""
+input:
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing meta information
+          e.g. `[ id:'accession1' ]`
+    - fasta:
+        type: file
+        description: FASTA file to split.
+        pattern: "*.{fa,fasta,fna}"
+        ontologies: []
+    - longest_seq_bp:
+        type: integer
+        description: Length in base pairs of the longest sequence in the input
+          FASTA.
+output:
+  fastas:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing meta information
+            e.g. `[ id:'accession1' ]`
+      - splits/**/*.fa:
+          type: file
+          description: Split FASTA files.
+          pattern: "splits/**/*.fa"
+          ontologies: []
+  agp:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing meta information
+            e.g. `[ id:'accession1' ]`
+      - splits/*.agp:
+          type: file
+          description: Optional AGP file describing split sequence chunks.
+ pattern: "splits/*.agp" + ontologies: [] + versions_fasta_split: + - - ${task.process}: + type: string + description: The name of the process. + - fasta_split: + type: string + description: The name of the tool. + - ? fasta_split --version + : type: eval + description: The expression to obtain the version of the tool +topics: + versions: + - - ${task.process}: + type: string + description: The name of the process. + - fasta_split: + type: string + description: The name of the tool. + - ? fasta_split --version + : type: eval + description: The expression to obtain the version of the tool +authors: + - "ensembl-dev@ebi.ac.uk" +maintainers: + - "ensembl-dev@ebi.ac.uk" diff --git a/modules/ensembl/fasta/split/tests/main.nf.test b/modules/ensembl/fasta/split/tests/main.nf.test new file mode 100644 index 0000000..042ff9c --- /dev/null +++ b/modules/ensembl/fasta/split/tests/main.nf.test @@ -0,0 +1,182 @@ +// See the NOTICE file distributed with this work for additional information +// regarding copyright ownership. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+// nf-core modules test fasta/split
+nextflow_process {
+
+    name "Test Process FASTA_SPLIT"
+    script "../main.nf"
+    process "FASTA_SPLIT"
+
+    tag "modules"
+    tag "modules_ensembl"
+    tag "fasta"
+    tag "fasta/split"
+
+    // Each test passes the full (meta, fasta, longest_seq_bp) tuple that FASTA_SPLIT declares.
+    test("stub outputs: default layout, no AGP") {
+
+        when {
+            options "-stub"
+
+            params.write_agp = false
+            params.unique_file_names = false
+            params.max_files_per_directory = null
+            params.max_dirs_per_directory = null
+
+            process {
+                """
+                input[0] = [[ id:'test' ], file('dummy.fa'), 1000]
+                """
+            }
+        }
+
+        then {
+            assert snapshot(process.out).match()
+
+            assert process.out.fastas != null
+            assert process.out.fastas.size() == 1
+
+            def fasta_out = process.out.fastas[0]
+            def meta = fasta_out[0]
+            def fas = fasta_out[1]
+
+            assert meta.id == "test"
+            assert fas != null
+            assert fas.size() == 2
+            assert fas.collect { path(it).toFile().name }.sort() == ["test.1.fa", "test.2.fa"]
+
+            assert process.out.agp != null
+            assert process.out.agp.size() == 0
+
+            assertAll(
+                { assert process.success }
+            )
+        }
+    }
+
+    test("stub outputs: AGP optional output appears when enabled") {
+
+        when {
+            options "-stub"
+
+            params.write_agp = true
+            params.unique_file_names = false
+            params.max_files_per_directory = null
+            params.max_dirs_per_directory = null
+
+            process {
+                """
+                input[0] = [[ id:'test' ], file('dummy.fa'), 1000]
+                """
+            }
+        }
+
+        then {
+            assert snapshot(process.out).match()
+
+            assert process.out.fastas.size() == 1
+            def fasta_out = process.out.fastas[0]
+            def fas = fasta_out[1]
+            assert fas.size() == 2
+
+            assert process.out.agp.size() == 1
+            def agp_out = process.out.agp[0]
+            def agp_meta = agp_out[0]
+            def agp = agp_out[1]
+            def agp_paths = agp instanceof List ? agp : [agp]
+            def agp_file = path(agp_paths[0]).toFile()
+
+            assert agp_meta.id == "test"
+            assert agp_paths.size() == 1
+            assert agp_file.name == "test.agp"
+
+            assertAll(
+                { assert process.success }
+            )
+        }
+    }
+
+    test("stub outputs: unique_file_names contract") {
+
+        when {
+            options "-stub"
+
+            params.write_agp = false
+            params.unique_file_names = true
+            params.max_files_per_directory = null
+            params.max_dirs_per_directory = null
+
+            process {
+                """
+                input[0] = [[ id:'test' ], file('dummy.fa'), 1000]
+                """
+            }
+        }
+
+        then {
+            assert snapshot(process.out).match()
+
+            def fasta_out = process.out.fastas[0]
+            def fas = fasta_out[1]
+
+            assert fas.size() == 2
+            assert process.out.agp.size() == 0
+
+            // Contract check: names match the unique fixture pattern
+            assert fas.collect { path(it).toFile().name }.sort() == ["test.0.1.fa", "test.0.2.fa"]
+
+            assertAll(
+                { assert process.success }
+            )
+        }
+    }
+
+    test("stub outputs: nested directory layout contract") {
+
+        when {
+            options "-stub"
+
+            params.write_agp = false
+            params.unique_file_names = false
+
+            // Trigger stub's nested fixture selection
+            params.max_files_per_directory = 100
+            params.max_dirs_per_directory = 100
+
+            process {
+                """
+                input[0] = [[ id:'test' ], file('dummy.fa'), 1000]
+                """
+            }
+        }
+
+        then {
+            assert snapshot(process.out).match()
+
+            def fastas = process.out.fastas[0][1]
+            assert fastas.size() == 2
+            assert process.out.agp.size() == 0
+
+            def rels = fastas.collect { path(it).toString() }
+            assert rels.any { it.contains("splits/0/0/") }
+            assert rels.any { it.contains("splits/0/1/") }
+
+            assertAll(
+                { assert process.success }
+            )
+        }
+    }
+}
diff --git a/modules/ensembl/fasta/split/tests/main.nf.test.snap b/modules/ensembl/fasta/split/tests/main.nf.test.snap
new file mode 100644
index 0000000..07ec1d6
--- /dev/null
+++ b/modules/ensembl/fasta/split/tests/main.nf.test.snap
@@ -0,0 +1,224 @@
+{
+    "stub outputs: default layout, no AGP": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        [
"test.1.fa:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.2.fa:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "1": [ + + ], + "2": [ + [ + "FASTA_SPLIT", + "fasta_split", + "1.6.3" + ] + ], + "agp": [ + + ], + "fastas": [ + [ + { + "id": "test" + }, + [ + "test.1.fa:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.2.fa:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "versions_fasta_split": [ + [ + "FASTA_SPLIT", + "fasta_split", + "1.6.3" + ] + ] + } + ], + "timestamp": "2026-05-14T14:38:41.602246", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, + "stub outputs: AGP optional output appears when enabled": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + [ + "test.1.fa:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.2.fa:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.agp:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + "FASTA_SPLIT", + "fasta_split", + "1.6.3" + ] + ], + "agp": [ + [ + { + "id": "test" + }, + "test.agp:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "fastas": [ + [ + { + "id": "test" + }, + [ + "test.1.fa:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.2.fa:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "versions_fasta_split": [ + [ + "FASTA_SPLIT", + "fasta_split", + "1.6.3" + ] + ] + } + ], + "timestamp": "2026-05-14T14:38:43.765608", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, + "stub outputs: nested directory layout contract": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + [ + "test.1.fa:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.2.fa:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "1": [ + + ], + "2": [ + [ + "FASTA_SPLIT", + "fasta_split", + "1.6.3" + ] + ], + "agp": [ + + ], + "fastas": [ + [ + { + "id": "test" + }, + [ + "test.1.fa:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.2.fa:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "versions_fasta_split": [ + [ + "FASTA_SPLIT", + 
"fasta_split", + "1.6.3" + ] + ] + } + ], + "timestamp": "2026-05-14T14:38:48.132705", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, + "stub outputs: unique_file_names contract": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + [ + "test.0.1.fa:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.0.2.fa:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "1": [ + + ], + "2": [ + [ + "FASTA_SPLIT", + "fasta_split", + "1.6.3" + ] + ], + "agp": [ + + ], + "fastas": [ + [ + { + "id": "test" + }, + [ + "test.0.1.fa:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.0.2.fa:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "versions_fasta_split": [ + [ + "FASTA_SPLIT", + "fasta_split", + "1.6.3" + ] + ] + } + ], + "timestamp": "2026-05-14T14:38:45.953655", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + } +} \ No newline at end of file diff --git a/modules/ensembl/features/combine_json/environment.yml b/modules/ensembl/features/combine_json/environment.yml new file mode 100644 index 0000000..94089f3 --- /dev/null +++ b/modules/ensembl/features/combine_json/environment.yml @@ -0,0 +1,6 @@ +--- +channels: + - conda-forge + - bioconda +dependencies: + - ensembl-genomio=1.6.1 diff --git a/modules/ensembl/features/combine_json/main.nf b/modules/ensembl/features/combine_json/main.nf new file mode 100644 index 0000000..e366038 --- /dev/null +++ b/modules/ensembl/features/combine_json/main.nf @@ -0,0 +1,126 @@ +// See the NOTICE file distributed with this work for additional information +// regarding copyright ownership. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Combine the split feature JSON files listed in `json_manifest` into
+// `<meta.id>.<analysis>.json` by running `features_combine_json`. `--agp-file`
+// is passed only when the staged `agp` is not the `NO_FILE` placeholder.
+process FEATURES_COMBINE_JSON {
+    tag "${meta.id}"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "${workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container
+        ? 'https://depot.galaxyproject.org/singularity/ensembl-genomio:1.6.1--pyhdfd78af_0'
+        : 'quay.io/biocontainers/ensembl-genomio:1.6.1--pyhdfd78af_0'}"
+
+    input:
+    tuple val(meta), val(analysis), path(json_manifest), path(agp)
+
+    output:
+    tuple val(meta), path("${meta.id}.${analysis}.json"), emit: combined_json
+    tuple val("${task.process}"), val('features_combine_json'), eval("features_combine_json --version"), emit: versions_features_combine_json, topic: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = []
+
+    if (params.chunk_id_regex) {
+        def rx = params.chunk_id_regex.replace("'", "'\"'\"'")
+        args << "--chunk-id-regex '${rx}'"
+    }
+
+    if (params.allow_revcomp) {
+        args << "--allow-revcomp"
+    }
+
+    def has_agp = agp && agp.baseName != 'NO_FILE'
+    if (has_agp) {
+        args << "--agp-file '${agp}'"
+    }
+
+    def out_json = "${meta.id}.${analysis}.json"
+
+    """
+    features_combine_json \\
+        --json-manifest '${json_manifest}' \\
+        --out-json '${out_json}' \\
+        ${args.join(' ')}
+    """
+
+    stub:
+    """
+    out_json="${meta.id}.${analysis}.json"
+
+    [[ -s "${json_manifest}" ]] || { echo "ERROR: manifest is missing or empty: ${json_manifest}" >&2; exit 1; }
+
+    # A relative first entry is resolved against the directory of the manifest's real path.
+    manifest_real="\$(python -c 'from pathlib import Path; import sys; print(Path(sys.argv[1]).resolve())' "${json_manifest}")"
+    manifest_dir="\$(dirname "\$manifest_real")"
+
+    first_json="\$(head -n 1 "${json_manifest}")"
+    if [[ -z "\$first_json" ]]; then
+        echo "ERROR: manifest is empty: ${json_manifest}" >&2
+        exit 1
+    fi
+    if [[ "\$first_json" != /* ]]; then
+        first_json="\${manifest_dir}/\${first_json}"
+    fi
+    if [[ ! -s "\$first_json" ]]; then
+        echo "ERROR: first JSON in manifest does not exist or is empty: \$first_json" >&2
+        exit 1
+    fi
+
+    # The first listed JSON decides which stub document (ncRNA or repeat) is written.
+    if grep -q '"ncrna_features"' "\$first_json"; then
+        load_type="ncrna"
+    elif grep -q '"repeat_features"' "\$first_json"; then
+        load_type="repeat"
+    else
+        echo "ERROR: cannot detect load type from first JSON: \$first_json" >&2
+        echo "Expected top-level key: 'repeat_features' or 'ncrna_features'." >&2
+        exit 1
+    fi
+
+    if [[ "\$load_type" == "repeat" ]]; then
+        cat > "\$out_json" <<-EOF
+{
+  "analysis": {
+    "logic_name": "stub_repeat"
+  },
+  "source": {
+    "source_provider": "stub"
+  },
+  "repeat_consensus": [],
+  "repeat_features": []
+}
+EOF
+    else
+        cat > "\$out_json" <<-EOF
+{
+  "analysis": {
+    "logic_name": "stub_ncrna"
+  },
+  "source": {
+    "source_provider": "stub"
+  },
+  "ncrna_tool": "stub",
+  "ncrna_features": []
+    }
+EOF
+    fi
+    """
+}
diff --git a/modules/ensembl/features/combine_json/meta.yml b/modules/ensembl/features/combine_json/meta.yml
new file mode 100644
index 0000000..049fa1a
--- /dev/null
+++ b/modules/ensembl/features/combine_json/meta.yml
@@ -0,0 +1,75 @@
+name: "features_combine_json"
+description: Combine split feature JSON files into a single JSON file,
+  optionally using an AGP file.
+keywords:
+  - ensembl
+  - features
+  - genomics
+  - genomio
+  - json
+tools:
+  - "features_combine_json":
+      description: "Combine split feature JSON files generated by ensembl-genomio."
+      homepage: "https://github.com/Ensembl/ensembl-genomio"
+      licence:
+        - "Apache License version 2.0"
+      identifier: ""
+input:
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing meta information
+          e.g. 
`[ id:'accession1' ]` + - analysis: + type: string + description: Analysis name to include in the combined JSON filename. + - json_manifest: + type: file + description: Manifest file listing split JSON files to combine. + pattern: "*.txt" + ontologies: [] + - agp: + type: file + description: + Optional AGP file describing how split sequence chunks should + be recombined. Use NO_FILE when not required. + pattern: "*.{agp,NO_FILE}" + ontologies: [] +output: + combined_json: + - - meta: + type: map + description: | + Groovy Map containing meta information + e.g. `[ id:'accession1' ]` + - ${meta.id}.${analysis}.json: + type: file + description: Combined feature JSON file. + pattern: "*.json" + ontologies: + - edam: http://edamontology.org/format_3464 + versions_features_combine_json: + - - ${task.process}: + type: string + description: The name of the process. + - features_combine_json: + type: string + description: The name of the tool. + - ? features_combine_json --version + : type: eval + description: The expression to obtain the version of the tool +topics: + versions: + - - ${task.process}: + type: string + description: The name of the process. + - features_combine_json: + type: string + description: The name of the tool. + - ? features_combine_json --version + : type: eval + description: The expression to obtain the version of the tool +authors: + - "ensembl-dev@ebi.ac.uk" +maintainers: + - "ensembl-dev@ebi.ac.uk" diff --git a/modules/ensembl/features/combine_json/tests/main.nf.test b/modules/ensembl/features/combine_json/tests/main.nf.test new file mode 100644 index 0000000..eb6af6d --- /dev/null +++ b/modules/ensembl/features/combine_json/tests/main.nf.test @@ -0,0 +1,163 @@ +// See the NOTICE file distributed with this work for additional information +// regarding copyright ownership. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// nf-core modules test features/combine_json
+nextflow_process {
+
+    name "Test Process FEATURES_COMBINE_JSON"
+    script "../main.nf"
+    process "FEATURES_COMBINE_JSON"
+
+    tag "modules"
+    tag "modules_ensembl"
+    tag "features"
+    tag "features/combine_json"
+
+    test("Stub outputs: repeat seq_region mode") {
+
+        when {
+            options "-stub"
+
+            process {
+                """
+                def manifest_file = file("manifest.txt")
+                manifest_file.text = "in.json\\n"
+
+                def chunk_json = file("in.json")
+                chunk_json.text = '{"repeat_features": []}\\n'
+
+                def agp_placeholder = file("NO_FILE")
+                agp_placeholder.text = ""
+
+                input[0] = [
+                    [ id:'test' ],
+                    'features',
+                    manifest_file,
+                    agp_placeholder
+                ]
+                """
+            }
+        }
+
+        then {
+            assert process.trace.tasks().size() == 1
+            assert process.out.combined_json.size() == 1
+            assert process.success
+            assert snapshot(process.out).match()
+        }
+    }
+
+    test("Stub outputs: ncRNA seq_region mode") {
+
+        when {
+            options "-stub"
+
+            process {
+                """
+                def manifest_file = file("manifest.txt")
+                manifest_file.text = "in.json\\n"
+
+                def chunk_json = file("in.json")
+                chunk_json.text = '{"ncrna_features": [], "ncrna_tool": "cmscan"}\\n'
+
+                def agp_placeholder = file("NO_FILE")
+                agp_placeholder.text = ""
+
+                input[0] = [
+                    [ id:'test' ],
+                    'features',
+                    manifest_file,
+                    agp_placeholder
+                ]
+                """
+            }
+        }
+
+        then {
+            assert process.trace.tasks().size() == 1
+            assert process.out.combined_json.size() == 1
+            assert process.success
+            assert snapshot(process.out).match()
+        }
+    }
+
+    test("Stub outputs: repeat AGP mode") {
+
+        when {
+            options "-stub"
+
+            process {
+                """
+                def manifest_file = file("manifest.txt")
+                manifest_file.text = "in.json\\n"
+
+                def chunk_json = file("in.json")
+                chunk_json.text = '{"repeat_features": []}\\n'
+
+                def agp_file = file("test.agp")
+                agp_file.text = ""
+
+                input[0] = [
+                    [ id:'test' ],
+                    'features',
+                    manifest_file,
+                    agp_file
+                ]
+                """
+            }
+        }
+
+        then {
+            assert process.trace.tasks().size() == 1
+            assert process.out.combined_json.size() == 1
+            assert process.success
+            assert snapshot(process.out).match()
+        }
+    }
+
+    test("Stub outputs: ncRNA AGP mode") {
+
+        when {
+            options "-stub"
+
+            process {
+                """
+                def manifest_file = file("manifest.txt")
+                manifest_file.text = "in.json\\n"
+
+                def chunk_json = file("in.json")
+                chunk_json.text = '{"ncrna_features": [], "ncrna_tool": "cmscan"}\\n'
+
+                def agp_file = file("test.agp")
+                agp_file.text = ""
+
+                input[0] = [
+                    [ id:'test' ],
+                    'features',
+                    manifest_file,
+                    agp_file
+                ]
+                """
+            }
+        }
+
+        then {
+            assert process.trace.tasks().size() == 1
+            assert process.out.combined_json.size() == 1
+            assert process.success
+            assert snapshot(process.out).match()
+        }
+    }
+}
diff --git a/modules/ensembl/features/combine_json/tests/main.nf.test.snap b/modules/ensembl/features/combine_json/tests/main.nf.test.snap
new file mode 100644
index 0000000..397c8f0
--- /dev/null
+++ b/modules/ensembl/features/combine_json/tests/main.nf.test.snap
@@ -0,0 +1,166 @@
+{
+    "Stub outputs: repeat AGP mode": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.features.json:md5,aefc84472e26178b64d01051be6d58b2"
+                    ]
+                ],
+                "1": [
+                    [
+                        "FEATURES_COMBINE_JSON",
+                        "features_combine_json",
+                        "1.6.3"
+                    ]
+                ],
+                "combined_json": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.features.json:md5,aefc84472e26178b64d01051be6d58b2"
+                    ]
+                ],
+                "versions_features_combine_json": [
+                    [
+                        "FEATURES_COMBINE_JSON",
+                        "features_combine_json",
+                        "1.6.3"
+                    ]
+                ]
+            }
+        ],
+        "timestamp": "2026-05-14T14:39:29.784572",
+        "meta": {
+            "nf-test": "0.9.4",
+            "nextflow": "25.10.3"
+        }
+    },
+    "Stub outputs: ncRNA seq_region mode": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.features.json:md5,67c630685f9c819ef28574144c284b4e"
+                    ]
+                ],
+                
"1": [ + [ + "FEATURES_COMBINE_JSON", + "features_combine_json", + "1.6.3" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.features.json:md5,67c630685f9c819ef28574144c284b4e" + ] + ], + "versions_features_combine_json": [ + [ + "FEATURES_COMBINE_JSON", + "features_combine_json", + "1.6.3" + ] + ] + } + ], + "timestamp": "2026-05-14T14:39:27.607529", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, + "Stub outputs: ncRNA AGP mode": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.features.json:md5,67c630685f9c819ef28574144c284b4e" + ] + ], + "1": [ + [ + "FEATURES_COMBINE_JSON", + "features_combine_json", + "1.6.3" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.features.json:md5,67c630685f9c819ef28574144c284b4e" + ] + ], + "versions_features_combine_json": [ + [ + "FEATURES_COMBINE_JSON", + "features_combine_json", + "1.6.3" + ] + ] + } + ], + "timestamp": "2026-05-14T14:39:31.963829", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, + "Stub outputs: repeat seq_region mode": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.features.json:md5,aefc84472e26178b64d01051be6d58b2" + ] + ], + "1": [ + [ + "FEATURES_COMBINE_JSON", + "features_combine_json", + "1.6.3" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.features.json:md5,aefc84472e26178b64d01051be6d58b2" + ] + ], + "versions_features_combine_json": [ + [ + "FEATURES_COMBINE_JSON", + "features_combine_json", + "1.6.3" + ] + ] + } + ], + "timestamp": "2026-05-14T14:39:25.403423", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + } +} \ No newline at end of file