class Params:
    """Validated settings that control how a FASTA file is split.

    ``fasta_file`` and ``out_dir`` are coerced to :class:`pathlib.Path`, so
    callers may pass plain strings as well as ``Path`` objects. When
    ``out_dir`` is omitted, the directory containing ``fasta_file`` is used.

    Raises:
        ValueError: if any numeric limit is zero or negative, or if
            ``min_chunk_length`` is given without ``max_seq_length_per_file``.
    """

    # (attribute name, command-line flag) for every limit that must be a
    # strictly positive integer when it is set. Order matters: it decides
    # which error is reported first when several limits are invalid.
    _POSITIVE_OPTIONS = (
        ("max_dirs_per_directory", "--max-dirs-per-directory"),
        ("max_files_per_directory", "--max-files-per-directory"),
        ("max_seqs_per_file", "--max-seqs-per-file"),
        ("max_seq_length_per_file", "--max-seq-length-per-file"),
    )

    def __init__(
        self,
        fasta_file: Path,
        out_dir: Optional[Path] = None,
        write_agp: bool = False,
        max_seqs_per_file: Optional[int] = None,
        max_seq_length_per_file: Optional[int] = None,
        min_chunk_length: Optional[int] = None,
        max_files_per_directory: Optional[int] = None,
        max_dirs_per_directory: Optional[int] = None,
        delete_existing_files: bool = False,
        unique_file_names: bool = False,
        delete_original_file: bool = False,
        force_max_seq_length: bool = False,
    ):
        # Coerce to Path so that string arguments do not fail later on
        # attribute access such as ``.parent``, ``.name`` or ``.exists()``.
        self.fasta_file = Path(fasta_file)
        self.out_dir = (
            Path(out_dir) if out_dir is not None else self.fasta_file.parent
        )
        self.write_agp = write_agp
        self.max_seqs_per_file = max_seqs_per_file
        self.max_seq_length_per_file = max_seq_length_per_file
        self.min_chunk_length = min_chunk_length
        self.max_files_per_directory = max_files_per_directory
        self.max_dirs_per_directory = max_dirs_per_directory
        self.delete_existing_files = delete_existing_files
        self.unique_file_names = unique_file_names
        self.delete_original_file = delete_original_file
        self.force_max_seq_length = force_max_seq_length

        self._validate_params()

    def _validate_params(self) -> None:
        """Reject limits that are not strictly positive or are inconsistent."""
        for attr_name, flag in self._POSITIVE_OPTIONS:
            value = getattr(self, attr_name)
            if value is not None and value <= 0:
                raise ValueError(f"{flag} must be > 0 or None")

        if self.min_chunk_length is not None:
            # A minimum chunk length only has meaning when sequences can be
            # chunked, which requires a maximum length to chunk against.
            if self.max_seq_length_per_file is None:
                raise ValueError(
                    "--min-chunk-length requires --max-seq-length-per-file"
                )
            if self.min_chunk_length <= 0:
                raise ValueError("--min-chunk-length must be > 0")
+ """ + + def __init__(self, params: Params): + self.params = params + self.basename = ( + params.fasta_file.name.removesuffix(".gz") + .removesuffix(".fa") + .removesuffix(".fasta") + ) + self.agp_file = ( + self.params.out_dir.joinpath(self.basename + ".agp") + if params.write_agp + else None + ) + self.file_count = 0 + self.record_count = 0 + self.file_len = 0 + self._fh = None + self._agp_fh = None + self._cleaned_dirs: Set[Path] = set() + + self.open_new_file() + + def _create_or_clean_dir(self, dir_path: Path) -> None: + try: + dir_path.mkdir(parents=True, exist_ok=True) + if self.params.delete_existing_files and dir_path not in self._cleaned_dirs: + for child in dir_path.iterdir(): + if child.is_dir(): + shutil.rmtree(child) + else: + child.unlink() + self._cleaned_dirs.add(dir_path) + except Exception: + logging.exception("Failed to prepare output directory '%s'", dir_path) + raise + + def _get_subdir_path(self, dir_index: int) -> Path: + """Computes subdirectory path based on dir_index and max_dirs_per_directory.""" + parts = [] + max_dirs = self.params.max_dirs_per_directory + if max_dirs is None: + parts.append("1") + else: + current_index = dir_index + while current_index >= 0: + parts.append(f"{current_index % max_dirs}") + current_index = current_index // max_dirs - 1 + + parts.reverse() + return self.params.out_dir.joinpath(*parts) + + def _get_file_and_dir_index(self) -> Tuple[int, int]: + """ + Determines index of file and directory based on file count and max files per directory. + Returns (file_index, dir_index). 
+ """ + max_files = self.params.max_files_per_directory + if max_files is None: + return self.file_count, 0 + adjusted_count = self.file_count - 1 + return (adjusted_count % max_files + 1, adjusted_count // max_files) + + def _get_path_for_next_file(self) -> Path: + """Computes path for the next output file.""" + self.file_count += 1 + file_index, dir_index = self._get_file_and_dir_index() + subdir_path = self._get_subdir_path(dir_index) + self._create_or_clean_dir(subdir_path) + + if self.params.unique_file_names: + file_name = f"{self.basename}.{dir_index}.{file_index}.fa" + else: + file_name = f"{self.basename}.{file_index}.fa" + return subdir_path.joinpath(file_name) + + def add_agp_entry( + self, + object_id: str, + start: int, + end: int, + part_nr: int, + part_id: str, + part_length: int, + ) -> None: + """Adds an entry to the AGP file.""" + # AGP columns for WGS contig component type: + # object, object_beg, object_end, part_number, component_type, + # component_id, component_beg, component_end, orientation + if self._agp_fh is None: + return + try: + line = f"{object_id}\t{start}\t{end}\t{part_nr}\tW\t{part_id}\t1\t{part_length}\t+\n" + self._agp_fh.write(line) + except Exception: + logging.exception("Failed to write AGP entry for part '%s'", part_id) + raise + + def create_agp_file(self) -> None: + """Creates the AGP file for recording sequence chunking.""" + if self.agp_file is None: + return + try: + self.params.out_dir.mkdir(parents=True, exist_ok=True) + self._agp_fh = open(self.agp_file, "w") + self._agp_fh.write("# AGP-version 2.0\n") + logging.info("Created AGP file '%s'", self.agp_file) + except Exception: + logging.exception("Failed to open AGP file '%s'", self.agp_file) + raise + + def open_new_file(self) -> None: + """Closes current file (if any) and opens a new output file.""" + if self._fh is not None: + self._fh.close() + + path = self._get_path_for_next_file() + try: + self._fh = open(path, "w") + logging.debug("Opened output file '%s'", 
def _write_whole_record(
    writer: OutputWriter, record: SeqRecord, write_agp: bool
) -> None:
    """Write ``record`` unsplit, plus its single-part AGP line when enabled."""
    writer.write_record(record)
    if write_agp:
        seq_len = len(record.seq)
        writer.add_agp_entry(record.id, 1, seq_len, 1, record.id, seq_len)


def _write_chunked_record(
    writer: OutputWriter, record: SeqRecord, params: Params
) -> None:
    """Split ``record`` into consecutive chunks and write one chunk per file."""
    seq_len = len(record.seq)
    max_seq_len = params.max_seq_length_per_file

    # 0-based, half-open [start, end) bounds of each chunk.
    starts = list(range(0, seq_len, max_seq_len))
    ends = [min(start + max_seq_len, seq_len) for start in starts]

    if params.min_chunk_length is not None and len(starts) > 1:
        last_chunk_len = ends[-1] - starts[-1]
        if last_chunk_len < params.min_chunk_length:
            logging.warning(
                "Length of last chunk of record '%s' is %d, lower than "
                "min_chunk_length: %d; merging with previous chunk",
                record.id,
                last_chunk_len,
                params.min_chunk_length,
            )
            # Extend the penultimate chunk to the end of the sequence and
            # drop the short remainder; the merged chunk may therefore be
            # longer than max_seq_length_per_file.
            ends[-2] = seq_len
            starts.pop()
            ends.pop()

    for part_nr, (start, end) in enumerate(zip(starts, ends), start=1):
        chunk_seq = record.seq[start:end]
        chunk_record = SeqRecord(
            chunk_seq,
            id=f"{record.id}_chunk_start_{start}",
            description=f"{record.description} (part {part_nr})",
        )
        # Each chunk gets a file of its own unless the current one is empty.
        if writer.record_count > 0:
            writer.open_new_file()
        writer.write_record(chunk_record)

        if params.write_agp:
            # AGP coordinates are 1-based and inclusive.
            writer.add_agp_entry(
                record.id,
                start + 1,
                end,
                part_nr,
                chunk_record.id,
                len(chunk_seq),
            )


def split_fasta(params: Params) -> None:
    """Split the input FASTA into smaller FASTA files, chunking if requested.

    Records are written in input order. A new output file is started when the
    current one already holds ``max_seqs_per_file`` records or when adding the
    next record would exceed ``max_seq_length_per_file``. A single record that
    is longer than ``max_seq_length_per_file`` is cut into chunks when
    ``force_max_seq_length`` is set; otherwise it is written whole to its own
    file and a warning is logged. An empty (zero-byte) input is a no-op.

    Raises:
        FileNotFoundError: if ``params.fasta_file`` does not exist.
        Exception: any error raised while reading or writing is logged and
            re-raised unchanged.
    """
    if not params.fasta_file.exists():
        logging.error(
            "DEBUG: fasta_file=%r resolved=%r cwd=%r",
            str(params.fasta_file),
            str(Path(params.fasta_file).resolve()),
            str(Path.cwd()),
        )
        raise FileNotFoundError(f"Fasta file '{params.fasta_file}' does not exist")

    # Do nothing if file size is 0
    if params.fasta_file.stat().st_size == 0:
        logging.info("Input FASTA '%s' is empty; nothing to do", params.fasta_file)
        return

    params.out_dir.mkdir(parents=True, exist_ok=True)

    writer = OutputWriter(params)
    max_seq_len = params.max_seq_length_per_file
    max_seqs = params.max_seqs_per_file

    try:
        if params.write_agp:
            writer.create_agp_file()

        with open_gz_file(params.fasta_file) as fh:
            for record in SeqIO.parse(fh, "fasta"):
                seq_len = len(record.seq)

                if max_seqs is not None and writer.record_count >= max_seqs:
                    writer.open_new_file()

                # Common case: the record fits into the current file.
                if max_seq_len is None or writer.file_len + seq_len <= max_seq_len:
                    _write_whole_record(writer, record, params.write_agp)
                    continue

                if params.force_max_seq_length and seq_len > max_seq_len:
                    _write_chunked_record(writer, record, params)
                    continue

                # The record does not fit into the current file. Warn only
                # when the record itself is over the limit; a record that
                # merely overflows the current file just moves to a new one.
                if seq_len > max_seq_len:
                    logging.warning(
                        "Record '%s' length %d exceeds max_seq_length_per_file %d but chunking not enabled",
                        record.id,
                        seq_len,
                        max_seq_len,
                    )
                if writer.record_count > 0:
                    writer.open_new_file()
                _write_whole_record(writer, record, params.write_agp)
    except Exception:
        logging.exception("Error processing FASTA file '%s'", params.fasta_file)
        raise
    finally:
        writer.close()

    # Only reached when splitting succeeded; deletion is best-effort.
    if params.delete_original_file:
        try:
            params.fasta_file.unlink(missing_ok=True)
        except Exception:
            logging.warning(
                "Failed to delete original FASTA file '%s'",
                params.fasta_file,
                exc_info=True,
            )
_get_param_defaults() + parser = ArgumentParser( + description="Split a FASTA file into multiple FASTA files, optionally chunking long sequences." + ) + parser.add_argument( + "--fasta-file", + type=Path, + required=True, + help="Input raw or compressed FASTA file containing sequences to split", + ) + parser.add_argument( + "--out-dir", + type=Path, + help="Top-level output directory (default: input FASTA directory)", + ) + parser.add_argument( + "--write-agp", + action="store_true", + help=f"Write AGP file describing the splits (default: {defaults['write_agp']})", + ) + parser.add_argument( + "--max-seqs-per-file", + type=int, + help=f"Max records per output file (default: {defaults['max_seqs_per_file']})", + ) + parser.add_argument( + "--max-seq-length-per-file", + type=int, + help=f"Max cumulative sequence length per output file (default: {defaults['max_seq_length_per_file']})", + ) + parser.add_argument( + "--min-chunk-length", + type=int, + help=f"Minimum length of a chunk allowed as a remainder (default: {defaults['min_chunk_length']})", + ) + parser.add_argument( + "--max-files-per-directory", + type=int, + help=f"Max files per directory before moving to next computed dir (default: {defaults['max_files_per_directory']})", + ) + parser.add_argument( + "--max-dirs-per-directory", + type=int, + help=f"Max subdirectories per directory level (default: {defaults['max_dirs_per_directory']})", + ) + parser.add_argument( + "--delete-existing-files", + action="store_true", + help=f"Delete existing files within computed output dirs (default: {defaults['delete_existing_files']})", + ) + parser.add_argument( + "--unique-file-names", + action="store_true", + help=f"Make output file names unique across dirs by including dir_index (default: {defaults['unique_file_names']})", + ) + parser.add_argument( + "--delete-original-file", + action="store_true", + help=f"Delete original input FASTA after splitting (default: {defaults['delete_original_file']})", + ) + 
def main(argv: Optional[List[str]] = None) -> None:
    """Command-line entry point: parse arguments, then split the FASTA file.

    Any exception is logged and re-raised so the process exits non-zero.

    Args:
        argv: Argument list to parse; ``None`` is passed through to
            ``parse_args``.
    """
    # ``params`` stays None until parsing succeeds, so the handler below can
    # tell an argument/validation failure from a processing failure instead
    # of hitting an unbound local while building the log message.
    params = None
    try:
        params = parse_args(argv)
        split_fasta(params)
    except Exception:
        if params is None:
            logging.exception("Failed to parse command-line arguments")
        else:
            logging.exception(
                "Error processing FASTA file '%s'", params.fasta_file
            )
        raise
def parse_agp_lines(agp_path: Path):
    """Return the data rows of an AGP file as lists of tab-separated fields.

    Blank lines and lines starting with ``#`` (such as the version header)
    are skipped.
    """
    rows = []
    for raw_line in agp_path.read_text(encoding="utf-8").splitlines():
        line = raw_line.rstrip("\n")
        if not line or line.startswith("#"):
            continue
        rows.append(line.split("\t"))
    return rows
"in.agp").exists() + assert len(list_output_fastas(out)) >= 1 + + +def test_split_by_max_seqs_per_file(tmp_path: Path, split_fasta_module): + inp = tmp_path / "in.fa" + out = tmp_path / "out" + recs = [ + SeqRecord(Seq("A" * 10), id="s1", description=""), + SeqRecord(Seq("C" * 10), id="s2", description=""), + SeqRecord(Seq("G" * 10), id="s3", description=""), + ] + write_fasta(inp, recs) + + params = split_fasta_module.Params( + fasta_file=inp, + out_dir=out, + max_seqs_per_file=2, + write_agp=False, + ) + split_fasta_module.split_fasta(params) + + fas = list_output_fastas(out) + assert len(fas) == 2 + assert read_all_ids_from_fastas(out) == ["s1", "s2", "s3"] + + +def test_chunk_merge_final_small_chunk_and_agp(tmp_path: Path, split_fasta_module): + """ + seq_len=2100, max=1000 -> chunks [1000, 1000, 100] + min_chunk_length=200 -> final chunk merged -> [1000, 1100] + """ + inp = tmp_path / "in.fa" + out = tmp_path / "out" + write_fasta(inp, [SeqRecord(Seq("A" * 2100), id="chr1", description="chr1")]) + + params = split_fasta_module.Params( + fasta_file=inp, + out_dir=out, + write_agp=True, + force_max_seq_length=True, + max_seq_length_per_file=1000, + min_chunk_length=200, + max_seqs_per_file=100000, # avoid seq-count splitting interfering + ) + split_fasta_module.split_fasta(params) + + # 2 chunks expected after merge + assert read_all_ids_from_fastas(out) == [ + "chr1_chunk_start_0", + "chr1_chunk_start_1000", + ] + + agp = out / "in.agp" + assert agp.exists() + + cols = parse_agp_lines(agp) + assert len(cols) == 2 + + # object, obj_beg, obj_end, part_no, type, comp_id, comp_beg, comp_end, orient + assert cols[0][0] == "chr1" + assert cols[0][1:4] == ["1", "1000", "1"] + assert cols[0][4] == "W" + assert cols[0][5] == "chr1_chunk_start_0" + assert cols[0][6:9] == ["1", "1000", "+"] + + assert cols[1][0] == "chr1" + assert cols[1][1:4] == ["1001", "2100", "2"] + assert cols[1][4] == "W" + assert cols[1][5] == "chr1_chunk_start_1000" + assert cols[1][6:9] == ["1", 
"1100", "+"] + + +def test_agp_part_numbers_restart_per_object(tmp_path: Path, split_fasta_module): + inp = tmp_path / "in.fa" + out = tmp_path / "out" + recs = [ + SeqRecord(Seq("A" * 1200), id="obj1", description=""), + SeqRecord(Seq("C" * 1200), id="obj2", description=""), + ] + write_fasta(inp, recs) + + params = split_fasta_module.Params( + fasta_file=inp, + out_dir=out, + write_agp=True, + force_max_seq_length=True, + max_seq_length_per_file=1000, + min_chunk_length=100, # => 2 chunks each, no merge + ) + split_fasta_module.split_fasta(params) + + cols = parse_agp_lines(out / "in.agp") + + by_obj = {} + for c in cols: + by_obj.setdefault(c[0], []).append(int(c[3])) + + assert by_obj["obj1"] == [1, 2] + assert by_obj["obj2"] == [1, 2] From cfe0b4e479f0a92e552f6e73fc6f69d1204e7168 Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Fri, 30 Jan 2026 14:20:36 +0000 Subject: [PATCH 03/36] Add nextflow module and tests for running split_fasta.py --- .gitignore | 3 + .../ensembl/fasta/splitfasta/environment.yml | 8 + modules/ensembl/fasta/splitfasta/main.nf | 106 ++++++ .../fasta/splitfasta/tests/data/agp/test.agp | 4 + .../fasta/splitfasta/tests/data/real/in.fa | 6 + .../tests/data/splits/default/0/test.1.fa | 4 + .../tests/data/splits/default/0/test.2.fa | 2 + .../tests/data/splits/multi_dir/0/0/test.1.fa | 2 + .../tests/data/splits/multi_dir/0/1/test.2.fa | 2 + .../tests/data/splits/unique/0/test.0.1.fa | 2 + .../tests/data/splits/unique/0/test.0.2.fa | 2 + .../fasta/splitfasta/tests/main.nf.test | 301 ++++++++++++++++++ .../fasta/splitfasta/tests/main.nf.test.snap | 168 ++++++++++ 13 files changed, 610 insertions(+) create mode 100644 modules/ensembl/fasta/splitfasta/environment.yml create mode 100644 modules/ensembl/fasta/splitfasta/main.nf create mode 100644 modules/ensembl/fasta/splitfasta/tests/data/agp/test.agp create mode 100644 modules/ensembl/fasta/splitfasta/tests/data/real/in.fa create mode 100644 
modules/ensembl/fasta/splitfasta/tests/data/splits/default/0/test.1.fa create mode 100644 modules/ensembl/fasta/splitfasta/tests/data/splits/default/0/test.2.fa create mode 100644 modules/ensembl/fasta/splitfasta/tests/data/splits/multi_dir/0/0/test.1.fa create mode 100644 modules/ensembl/fasta/splitfasta/tests/data/splits/multi_dir/0/1/test.2.fa create mode 100644 modules/ensembl/fasta/splitfasta/tests/data/splits/unique/0/test.0.1.fa create mode 100644 modules/ensembl/fasta/splitfasta/tests/data/splits/unique/0/test.0.2.fa create mode 100644 modules/ensembl/fasta/splitfasta/tests/main.nf.test create mode 100644 modules/ensembl/fasta/splitfasta/tests/main.nf.test.snap diff --git a/.gitignore b/.gitignore index e75900d..961b31c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ .nextflow* .nf-test* +__pycache__/ +*.pyc +.python-version \ No newline at end of file diff --git a/modules/ensembl/fasta/splitfasta/environment.yml b/modules/ensembl/fasta/splitfasta/environment.yml new file mode 100644 index 0000000..759f3da --- /dev/null +++ b/modules/ensembl/fasta/splitfasta/environment.yml @@ -0,0 +1,8 @@ +--- +name: "fasta_splitfasta" +channels: + - conda-forge + - bioconda +dependencies: + - python=3.11.7 + - biopython=1.86 \ No newline at end of file diff --git a/modules/ensembl/fasta/splitfasta/main.nf b/modules/ensembl/fasta/splitfasta/main.nf new file mode 100644 index 0000000..0a8b761 --- /dev/null +++ b/modules/ensembl/fasta/splitfasta/main.nf @@ -0,0 +1,106 @@ +// See the NOTICE file distributed with this work for additional information +// regarding copyright ownership. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
// Splits one FASTA file into several smaller FASTA files by running the
// split_fasta.py script that sits next to this module. All tuning options
// are read from pipeline-level `params`.
process FASTA_SPLITFASTA {

    tag "${meta.id}"
    label 'process_low'

    // Falls back to the launch directory when params.outdir is not set.
    publishDir "${params.outdir ?: '.'}", mode: 'copy'

    input:
    tuple val(meta), path(fasta)

    output:
    // Split FASTA files are collected from subdirectories of the task directory.
    tuple val(meta), path("**/*.fa"), emit: fasta
    // Only produced when params.write_agp is set, hence optional.
    tuple val(meta), path("*.agp"), emit: agp, optional: true

    script:
    // Build the option list; unset/false params are simply not forwarded, so
    // the script's own defaults apply. Note that --delete-original-file is
    // never forwarded by this module.
    def args = []

    if (params.max_seqs_per_file) {
        args << "--max-seqs-per-file ${params.max_seqs_per_file}"
    }

    if (params.max_seq_length_per_file) {
        args << "--max-seq-length-per-file ${params.max_seq_length_per_file}"
    }

    if (params.min_chunk_length) {
        args << "--min-chunk-length ${params.min_chunk_length}"
    }

    if (params.max_files_per_directory) {
        args << "--max-files-per-directory ${params.max_files_per_directory}"
    }

    if (params.max_dirs_per_directory) {
        args << "--max-dirs-per-directory ${params.max_dirs_per_directory}"
    }

    if (params.force_max_seq_length) {
        args << "--force-max-seq-length"
    }

    if (params.write_agp) {
        args << "--write-agp"
    }

    if (params.unique_file_names) {
        args << "--unique-file-names"
    }

    if (params.delete_existing_files) {
        args << "--delete-existing-files"
    }

    // Input and output are both anchored at the task working directory.
    // NOTE(review): the paths are not shell-quoted; confirm staged file
    // names can never contain spaces.
    """
    python \\
        ${moduleDir}/split_fasta.py \\
        --fasta-file \$PWD/${fasta} \\
        --out-dir \$PWD \\
        ${args.join(' ')}
    """

    stub:
    // Stub mode copies pre-built fixtures from tests/data instead of running
    // the script. The fixture layout is chosen from the params: "unique" when
    // unique_file_names is set, "multi_dir" when either directory limit is
    // set, otherwise "default". Fixture files named test*.fa are renamed so
    // "test" becomes meta.id, and the fixture AGP is copied only when
    // write_agp is set.
    """
    set -euo pipefail

    FIXTURE_DIR="${moduleDir}/tests/data"

    LAYOUT="default"
    if [[ "${params.unique_file_names ?: false}" == "true" ]]; then
        LAYOUT="unique"
    elif [[ -n "${params.max_dirs_per_directory ?: ''}" || -n "${params.max_files_per_directory ?: ''}" ]]; then
        LAYOUT="multi_dir"
    fi

    mkdir -p splits
    cp -R "\$FIXTURE_DIR/splits/\$LAYOUT/." "splits/"

    find splits -type f -name 'test*.fa' | while read -r f; do
        bn=\$(basename "\$f")
        dir=\$(dirname "\$f")
        new_bn="\${bn/test/${meta.id}}"
        mv "\$f" "\${dir}/\${new_bn}"
    done

    if [[ "${params.write_agp ?: false}" == "true" ]]; then
        cp "\$FIXTURE_DIR/agp/test.agp" "${meta.id}.agp"
    fi
    """


}
b/modules/ensembl/fasta/splitfasta/tests/data/splits/multi_dir/0/0/test.1.fa new file mode 100644 index 0000000..9512f36 --- /dev/null +++ b/modules/ensembl/fasta/splitfasta/tests/data/splits/multi_dir/0/0/test.1.fa @@ -0,0 +1,2 @@ +>seq1 +AAAAAAAAAA \ No newline at end of file diff --git a/modules/ensembl/fasta/splitfasta/tests/data/splits/multi_dir/0/1/test.2.fa b/modules/ensembl/fasta/splitfasta/tests/data/splits/multi_dir/0/1/test.2.fa new file mode 100644 index 0000000..2f3b40f --- /dev/null +++ b/modules/ensembl/fasta/splitfasta/tests/data/splits/multi_dir/0/1/test.2.fa @@ -0,0 +1,2 @@ +>seq2 +CCCCCCCCCC \ No newline at end of file diff --git a/modules/ensembl/fasta/splitfasta/tests/data/splits/unique/0/test.0.1.fa b/modules/ensembl/fasta/splitfasta/tests/data/splits/unique/0/test.0.1.fa new file mode 100644 index 0000000..9512f36 --- /dev/null +++ b/modules/ensembl/fasta/splitfasta/tests/data/splits/unique/0/test.0.1.fa @@ -0,0 +1,2 @@ +>seq1 +AAAAAAAAAA \ No newline at end of file diff --git a/modules/ensembl/fasta/splitfasta/tests/data/splits/unique/0/test.0.2.fa b/modules/ensembl/fasta/splitfasta/tests/data/splits/unique/0/test.0.2.fa new file mode 100644 index 0000000..2f3b40f --- /dev/null +++ b/modules/ensembl/fasta/splitfasta/tests/data/splits/unique/0/test.0.2.fa @@ -0,0 +1,2 @@ +>seq2 +CCCCCCCCCC \ No newline at end of file diff --git a/modules/ensembl/fasta/splitfasta/tests/main.nf.test b/modules/ensembl/fasta/splitfasta/tests/main.nf.test new file mode 100644 index 0000000..3db1283 --- /dev/null +++ b/modules/ensembl/fasta/splitfasta/tests/main.nf.test @@ -0,0 +1,301 @@ +// nf-core modules test fasta/splitfasta +nextflow_process { + + name "Test Process FASTA_SPLITFASTA" + script "../main.nf" + process "FASTA_SPLITFASTA" + + tag "modules" + tag "modules_ensembl" + tag "fasta" + tag "fasta/splitfasta" + + + def real_fa = new File("modules/ensembl/fasta/splitfasta/tests/data/real/in.fa").canonicalFile + + test("Stub outputs: default layout, no AGP") 
{ + + when { + options "-stub" + + // Ensure params are set explicitly for this test + params.write_agp = false + params.unique_file_names = false + params.max_files_per_directory = null + params.max_dirs_per_directory = null + + process { + """ + input[0] = [[ id:'test' ], file('dummy.fa')] + """ + } + } + + then { + assert snapshot(process.out).match() + + // fasta: tuple(meta, fa_paths) + assert process.out.fasta != null + assert process.out.fasta.size() == 1 + + def fasta_out = process.out.fasta[0] + def meta = fasta_out[0] + def fas = fasta_out[1] + + assert meta.id == "test" + assert fas != null + assert fas.size() == 2 + + // agp: tuple(meta, agp_paths) optional -> should be absent + assert process.out.agp != null + assert process.out.agp.size() == 0 + + // Ensure FASTA parsing works (downstream contract) + def merged = fas + .collect { path(it).fasta } + .inject([:]) { acc, m -> acc + m } + + assert merged.keySet().containsAll(["seq1", "seq2", "seq3"]) + + assertAll( + { assert process.success } + ) + } + } + + test("Stub outputs: AGP optional output appears when enabled") { + + when { + options "-stub" + + params.write_agp = true + params.unique_file_names = false + params.max_files_per_directory = null + params.max_dirs_per_directory = null + + process { + """ + input[0] = [[ id:'test' ], file('dummy.fa')] + """ + } + } + + then { + assert snapshot(process.out).match() + + assert process.out.fasta.size() == 1 + def fasta_out = process.out.fasta[0] + def fas = fasta_out[1] + assert fas.size() == 2 + + assert process.out.agp.size() == 1 + def agp_out = process.out.agp[0] + def agp_meta = agp_out[0] + def agp = agp_out[1] + def agp_paths = agp instanceof List ? 
agp : [agp] + def agp_file = path(agp_paths[0]).toFile() + + assert agp_meta.id == "test" + assert agp_paths.size() == 1 + assert agp_file.name == "test.agp" + + def agp_text = agp_file.text + assert agp_text.startsWith("# AGP-version 2.0") + assert agp_text.contains("seq1\t1\t10\t1\tW\tseq1\t1\t10\t+") + assert agp_text.contains("seq2\t1\t10\t1\tW\tseq2\t1\t10\t+") + assert agp_text.contains("seq3\t1\t11\t1\tW\tseq3\t1\t11\t+") + + assertAll( + { assert process.success } + ) + } + } + + test("Stub outputs: unique_file_names contract") { + + when { + options "-stub" + + params.write_agp = false + params.unique_file_names = true + params.max_files_per_directory = null + params.max_dirs_per_directory = null + + process { + """ + input[0] = [[ id:'test' ], file('dummy.fa')] + """ + } + } + + then { + assert snapshot(process.out).match() + + def fasta_out = process.out.fasta[0] + def fas = fasta_out[1] + + assert fas.size() == 2 + assert process.out.agp.size() == 0 + + // Contract check: names match the unique fixture pattern + assert fas.collect { path(it).toFile().name }.sort() == ["test.0.1.fa", "test.0.2.fa"] + + assertAll( + { assert process.success } + ) + } + } + + test("Stub outputs: nested directory layout contract") { + + when { + options "-stub" + + params.write_agp = false + params.unique_file_names = false + + // Trigger stub's nested fixture selection + params.max_files_per_directory = 100 + params.max_dirs_per_directory = 100 + + process { + """ + input[0] = [[ id:'test' ], file('dummy.fa')] + """ + } + } + + then { + assert snapshot(process.out).match() + + def fastas = process.out.fasta[0][1] + assert fastas.size() == 2 + assert process.out.agp.size() == 0 + + def rels = fastas.collect { path(it).toString() } + assert rels.any { it.contains("splits/0/0/") } + assert rels.any { it.contains("splits/0/1/") } + + assertAll( + { assert process.success } + ) + } + } + + test("Real run: default behaviour produces FASTAs and no AGP") { + + when { + 
params.write_agp = false + params.unique_file_names = false + params.max_seqs_per_file = null + params.max_seq_length_per_file = null + params.max_files_per_directory = null + params.max_dirs_per_directory = null + params.force_max_seq_length = false + + process { + """ + input[0] = [[ id:'test' ], file('${real_fa.absolutePath}')] + """ + } + } + + then { + assert process.success + + assert process.out.fasta != null + assert process.out.fasta.size() == 1 + + def out = process.out.fasta[0] + def meta = out[0] + def fas = out[1] + + assert meta.id == "test" + def fas_list = (fas instanceof List) ? fas : [fas] + assert fas_list.size() >= 1 + + assert process.out.agp != null + assert process.out.agp.size() == 0 + + def merged = fas_list + .collect { path(it).fasta } + .inject([:]) { acc, m -> acc + m } + + assert merged.keySet().containsAll(["seq1", "seq2", "seq3"]) + } + } + + test("Real run: write_agp=true emits exactly one AGP file") { + + when { + params.write_agp = true + params.unique_file_names = false + params.max_files_per_directory = null + params.max_dirs_per_directory = null + params.max_seqs_per_file = null + params.max_seq_length_per_file = null + params.force_max_seq_length = false + + process { + """ + input[0] = [[ id:'test' ], file('${real_fa.absolutePath}')] + """ + } + } + + then { + assert process.success + + assert process.out.agp != null + assert process.out.agp.size() == 1 + + def agp_out = process.out.agp[0] + def agp_meta = agp_out[0] + def agp_val = agp_out[1] + + assert agp_meta.id == "test" + + def agp_list = (agp_val instanceof List) ? 
agp_val : [agp_val] + assert agp_list.size() == 1 + + def agp_path = path(agp_list[0]) + assert agp_path.fileName.toString().endsWith(".agp") + + def agp_text = agp_path.toFile().text + assert agp_text.startsWith("# AGP-version 2.0") + assert agp_text.contains("seq1\t1\t10\t1\tW\tseq1\t1\t10\t+") + assert agp_text.contains("seq2\t1\t10\t1\tW\tseq2\t1\t10\t+") + assert agp_text.contains("seq3\t1\t11\t1\tW\tseq3\t1\t11\t+") + } + } + + test("Real run: max_seqs_per_file=2 splits into 2 FASTA outputs") { + + when { + params.write_agp = false + params.max_seqs_per_file = 2 + params.unique_file_names = false + params.max_files_per_directory = null + params.max_dirs_per_directory = null + + process { + """ + input[0] = [[ id:'test' ], file('${real_fa.absolutePath}')] + """ + } + } + + then { + assert process.success + + def fas = process.out.fasta[0][1] + assert fas.size() == 2 + + def merged = fas + .collect { path(it).fasta } + .inject([:]) { acc, m -> acc + m } + + assert merged.keySet().containsAll(["seq1", "seq2", "seq3"]) + } + } +} diff --git a/modules/ensembl/fasta/splitfasta/tests/main.nf.test.snap b/modules/ensembl/fasta/splitfasta/tests/main.nf.test.snap new file mode 100644 index 0000000..3390583 --- /dev/null +++ b/modules/ensembl/fasta/splitfasta/tests/main.nf.test.snap @@ -0,0 +1,168 @@ +{ + "Stub outputs: AGP optional output appears when enabled": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + [ + "test.1.fa:md5,336490c5e8c624cb1ae29048f28f0978", + "test.2.fa:md5,55d5ca305356033516f7ae1b5ecca900" + ] + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.agp:md5,c12ac51bd2b1ca95cdd8f011eca0cd1c" + ] + ], + "agp": [ + [ + { + "id": "test" + }, + "test.agp:md5,c12ac51bd2b1ca95cdd8f011eca0cd1c" + ] + ], + "fasta": [ + [ + { + "id": "test" + }, + [ + "test.1.fa:md5,336490c5e8c624cb1ae29048f28f0978", + "test.2.fa:md5,55d5ca305356033516f7ae1b5ecca900" + ] + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": 
"2026-01-30T10:38:07.606463" + }, + "Stub outputs: nested directory layout contract": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + [ + "test.1.fa:md5,41e176f082cc04841e50d8aa5c4f4d5a", + "test.2.fa:md5,e3bd0305f6466c13a1479c6b82391e6f" + ] + ] + ], + "1": [ + + ], + "agp": [ + + ], + "fasta": [ + [ + { + "id": "test" + }, + [ + "test.1.fa:md5,41e176f082cc04841e50d8aa5c4f4d5a", + "test.2.fa:md5,e3bd0305f6466c13a1479c6b82391e6f" + ] + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-01-30T10:38:11.815126" + }, + "Stub outputs: default layout, no AGP": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + [ + "test.1.fa:md5,336490c5e8c624cb1ae29048f28f0978", + "test.2.fa:md5,55d5ca305356033516f7ae1b5ecca900" + ] + ] + ], + "1": [ + + ], + "agp": [ + + ], + "fasta": [ + [ + { + "id": "test" + }, + [ + "test.1.fa:md5,336490c5e8c624cb1ae29048f28f0978", + "test.2.fa:md5,55d5ca305356033516f7ae1b5ecca900" + ] + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-01-30T10:38:05.482323" + }, + "Stub outputs: unique_file_names contract": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + [ + "test.0.1.fa:md5,41e176f082cc04841e50d8aa5c4f4d5a", + "test.0.2.fa:md5,e3bd0305f6466c13a1479c6b82391e6f" + ] + ] + ], + "1": [ + + ], + "agp": [ + + ], + "fasta": [ + [ + { + "id": "test" + }, + [ + "test.0.1.fa:md5,41e176f082cc04841e50d8aa5c4f4d5a", + "test.0.2.fa:md5,e3bd0305f6466c13a1479c6b82391e6f" + ] + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-01-30T10:38:09.698407" + } +} \ No newline at end of file From acfe54f1ea92e21e918152fc55cea025757cfe79 Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Fri, 30 Jan 2026 14:28:51 +0000 Subject: [PATCH 04/36] Add python script for splitting FASTA, chunking if necessary --- .../ensembl/fasta/splitfasta/split_fasta.py | 462 ++++++++++++++++++ 1 file changed, 
462 insertions(+) create mode 100644 modules/ensembl/fasta/splitfasta/split_fasta.py diff --git a/modules/ensembl/fasta/splitfasta/split_fasta.py b/modules/ensembl/fasta/splitfasta/split_fasta.py new file mode 100644 index 0000000..164ec44 --- /dev/null +++ b/modules/ensembl/fasta/splitfasta/split_fasta.py @@ -0,0 +1,462 @@ +#!/usr/bin/env python3 + +"""Split a FASTA file (possibly gzipped) into multiple smaller FASTA files.""" + +import inspect +import logging +import shutil +from pathlib import Path +from typing import Optional, List, Set, Tuple + +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord + +try: + from ensembl.utils.archive import open_gz_file # type: ignore +except ImportError: + import gzip + + def open_gz_file(path): + p = str(path) + return gzip.open(p, "rt") if p.endswith(".gz") else open(p, "rt") + + +try: + from ensembl.utils.argparse import ArgumentParser # type: ignore +except ImportError: + from argparse import ArgumentParser + +try: + from ensembl.utils.logging import init_logging_with_args # type: ignore +except ImportError: + import logging + + def init_logging_with_args(args): + level = getattr(args, "log_level", "INFO") + logging.basicConfig(level=level) + + +class Params: + """Class to hold parameters for splitting FASTA files.""" + + def __init__( + self, + fasta_file: Path, + out_dir: Optional[Path] = None, + write_agp: bool = False, + max_seqs_per_file: Optional[int] = None, + max_seq_length_per_file: Optional[int] = None, + min_chunk_length: Optional[int] = None, + max_files_per_directory: Optional[int] = None, + max_dirs_per_directory: Optional[int] = None, + delete_existing_files: bool = False, + unique_file_names: bool = False, + delete_original_file: bool = False, + force_max_seq_length: bool = False, + ): + self.fasta_file = fasta_file + self.out_dir = out_dir if out_dir is not None else fasta_file.parent + self.write_agp = write_agp + self.max_seqs_per_file = max_seqs_per_file + self.max_seq_length_per_file = 
max_seq_length_per_file + self.min_chunk_length = min_chunk_length + self.max_files_per_directory = max_files_per_directory + self.max_dirs_per_directory = max_dirs_per_directory + self.delete_existing_files = delete_existing_files + self.unique_file_names = unique_file_names + self.delete_original_file = delete_original_file + self.force_max_seq_length = force_max_seq_length + + self._validate_params() + + def _validate_params(self) -> None: + if self.max_dirs_per_directory is not None and self.max_dirs_per_directory <= 0: + raise ValueError("--max-dirs-per-directory must be > 0 or None") + if ( + self.max_files_per_directory is not None + and self.max_files_per_directory <= 0 + ): + raise ValueError("--max-files-per-directory must be > 0 or None") + if self.max_seqs_per_file is not None and self.max_seqs_per_file <= 0: + raise ValueError("--max-seqs-per-file must be > 0 or None") + if ( + self.max_seq_length_per_file is not None + and self.max_seq_length_per_file <= 0 + ): + raise ValueError("--max-seq-length-per-file must be > 0 or None") + if self.min_chunk_length is not None: + if self.max_seq_length_per_file is None: + raise ValueError( + "--min-chunk-length requires --max-seq-length-per-file" + ) + if self.min_chunk_length <= 0: + raise ValueError("--min-chunk-length must be > 0") + + +class OutputWriter: + """ + Manages output file creation and counters, writing in a single pass. + Creates/cleans directories lazily as required. 
+ """ + + def __init__(self, params: Params): + self.params = params + self.basename = ( + params.fasta_file.name.removesuffix(".gz") + .removesuffix(".fa") + .removesuffix(".fasta") + ) + self.agp_file = ( + self.params.out_dir.joinpath(self.basename + ".agp") + if params.write_agp + else None + ) + self.file_count = 0 + self.record_count = 0 + self.file_len = 0 + self._fh = None + self._agp_fh = None + self._cleaned_dirs: Set[Path] = set() + + self.open_new_file() + + def _create_or_clean_dir(self, dir_path: Path) -> None: + try: + dir_path.mkdir(parents=True, exist_ok=True) + if self.params.delete_existing_files and dir_path not in self._cleaned_dirs: + for child in dir_path.iterdir(): + if child.is_dir(): + shutil.rmtree(child) + else: + child.unlink() + self._cleaned_dirs.add(dir_path) + except Exception: + logging.exception("Failed to prepare output directory '%s'", dir_path) + raise + + def _get_subdir_path(self, dir_index: int) -> Path: + """Computes subdirectory path based on dir_index and max_dirs_per_directory.""" + parts = [] + max_dirs = self.params.max_dirs_per_directory + if max_dirs is None: + parts.append("1") + else: + current_index = dir_index + while current_index >= 0: + parts.append(f"{current_index % max_dirs}") + current_index = current_index // max_dirs - 1 + + parts.reverse() + return self.params.out_dir.joinpath(*parts) + + def _get_file_and_dir_index(self) -> Tuple[int, int]: + """ + Determines index of file and directory based on file count and max files per directory. + Returns (file_index, dir_index). 
+ """ + max_files = self.params.max_files_per_directory + if max_files is None: + return self.file_count, 0 + adjusted_count = self.file_count - 1 + return (adjusted_count % max_files + 1, adjusted_count // max_files) + + def _get_path_for_next_file(self) -> Path: + """Computes path for the next output file.""" + self.file_count += 1 + file_index, dir_index = self._get_file_and_dir_index() + subdir_path = self._get_subdir_path(dir_index) + self._create_or_clean_dir(subdir_path) + + if self.params.unique_file_names: + file_name = f"{self.basename}.{dir_index}.{file_index}.fa" + else: + file_name = f"{self.basename}.{file_index}.fa" + return subdir_path.joinpath(file_name) + + def add_agp_entry( + self, + object_id: str, + start: int, + end: int, + part_nr: int, + part_id: str, + part_length: int, + ) -> None: + """Adds an entry to the AGP file.""" + # AGP columns for WGS contig component type: + # object, object_beg, object_end, part_number, component_type, + # component_id, component_beg, component_end, orientation + if self._agp_fh is None: + return + try: + line = f"{object_id}\t{start}\t{end}\t{part_nr}\tW\t{part_id}\t1\t{part_length}\t+\n" + self._agp_fh.write(line) + except Exception: + logging.exception("Failed to write AGP entry for part '%s'", part_id) + raise + + def create_agp_file(self) -> None: + """Creates the AGP file for recording sequence chunking.""" + if self.agp_file is None: + return + try: + self.params.out_dir.mkdir(parents=True, exist_ok=True) + self._agp_fh = open(self.agp_file, "w") + self._agp_fh.write("# AGP-version 2.0\n") + logging.info("Created AGP file '%s'", self.agp_file) + except Exception: + logging.exception("Failed to open AGP file '%s'", self.agp_file) + raise + + def open_new_file(self) -> None: + """Closes current file (if any) and opens a new output file.""" + if self._fh is not None: + self._fh.close() + + path = self._get_path_for_next_file() + try: + self._fh = open(path, "w") + logging.debug("Opened output file '%s'", 
path) + except Exception: + logging.exception("Failed to open output file '%s'", path) + raise + self.record_count = 0 + self.file_len = 0 + + def write_record(self, record: SeqRecord) -> None: + """Writes a SeqRecord to the current output file.""" + try: + SeqIO.write(record, self._fh, "fasta") + self.record_count += 1 + self.file_len += len(record.seq) + except Exception: + logging.exception("Failed to write record '%s' to output file", record.id) + raise + + def close(self) -> None: + if self._fh is not None: + self._fh.close() + self._fh = None + if self._agp_fh is not None: + self._agp_fh.close() + self._agp_fh = None + + +def _get_param_defaults() -> dict: + """Retrieve default values for Params class attributes.""" + signature = inspect.signature(Params.__init__) + defaults = {} + for name, param in signature.parameters.items(): + if name != "self" and param.default is not inspect.Parameter.empty: + defaults[name] = param.default + return defaults + + +def split_fasta(params: Params) -> None: + """Splits the input FASTA file into multiple smaller FASTA files, chunking long sequences if required.""" + if not params.fasta_file.exists(): + logging.error( + "DEBUG: fasta_file=%r resolved=%r cwd=%r", + str(params.fasta_file), + str(Path(params.fasta_file).resolve()), + str(Path.cwd()), + ) + raise FileNotFoundError(f"Fasta file '{params.fasta_file}' does not exist") + + # Do nothing if file size is 0 + if params.fasta_file.stat().st_size == 0: + logging.info("Input FASTA '%s' is empty; nothing to do", params.fasta_file) + return + + params.out_dir.mkdir(parents=True, exist_ok=True) + + writer = OutputWriter(params) + + try: + if params.write_agp: + writer.create_agp_file() + + with open_gz_file(params.fasta_file) as fh: + for record in SeqIO.parse(fh, "fasta"): + seq_len = len(record.seq) + max_seq_len = params.max_seq_length_per_file + max_seqs = params.max_seqs_per_file + + if max_seqs is not None and writer.record_count >= max_seqs: + writer.open_new_file() + 
+ if max_seq_len is None or writer.file_len + seq_len <= max_seq_len: + writer.write_record(record) + if params.write_agp: + writer.add_agp_entry( + record.id, 1, seq_len, 1, record.id, seq_len + ) + continue + + if params.force_max_seq_length and seq_len > max_seq_len: + starts = list(range(0, seq_len, max_seq_len)) + ends = [min(s + max_seq_len, seq_len) for s in starts] + + if params.min_chunk_length is not None and len(starts) > 1: + last_chunk_len = ends[-1] - starts[-1] + if last_chunk_len < params.min_chunk_length: + logging.warning( + "Length of last chunk of record '%s' is %d, lower than min_chunk_length: %d;" + + "merging with previous chunk", + record.id, + last_chunk_len, + params.min_chunk_length, + ) + ends[-2] = seq_len + starts.pop() + ends.pop() + + for i, (start, end) in enumerate(zip(starts, ends), start=1): + chunk_seq = record.seq[start:end] + chunk_record = SeqRecord( + chunk_seq, + id=f"{record.id}_chunk_start_{start}", + description=f"{record.description} (part {i})", + ) + if writer.record_count > 0: + writer.open_new_file() + writer.write_record(chunk_record) + + if params.write_agp: + writer.add_agp_entry( + record.id, + start + 1, + end, + i, + chunk_record.id, + len(chunk_seq), + ) + else: + logging.warning( + "Record '%s' length %d exceeds max_seq_length_per_file %d but chunking not enabled", + record.id, + seq_len, + max_seq_len, + ) + if writer.record_count > 0: + writer.open_new_file() + writer.write_record(record) + if params.write_agp: + writer.add_agp_entry( + record.id, 1, seq_len, 1, record.id, seq_len + ) + except Exception: + logging.exception("Error processing FASTA file '%s'", params.fasta_file) + raise + finally: + writer.close() + + if params.delete_original_file: + try: + params.fasta_file.unlink(missing_ok=True) + except Exception: + logging.warning( + "Failed to delete original FASTA file '%s'", + params.fasta_file, + exc_info=True, + ) + + +def parse_args(argv: Optional[List[str]] = None) -> Params: + defaults = 
_get_param_defaults() + parser = ArgumentParser( + description="Split a FASTA file into multiple FASTA files, optionally chunking long sequences." + ) + parser.add_argument( + "--fasta-file", + type=Path, + required=True, + help="Input raw or compressed FASTA file containing sequences to split", + ) + parser.add_argument( + "--out-dir", + type=Path, + help="Top-level output directory (default: input FASTA directory)", + ) + parser.add_argument( + "--write-agp", + action="store_true", + help=f"Write AGP file describing the splits (default: {defaults['write_agp']})", + ) + parser.add_argument( + "--max-seqs-per-file", + type=int, + help=f"Max records per output file (default: {defaults['max_seqs_per_file']})", + ) + parser.add_argument( + "--max-seq-length-per-file", + type=int, + help=f"Max cumulative sequence length per output file (default: {defaults['max_seq_length_per_file']})", + ) + parser.add_argument( + "--min-chunk-length", + type=int, + help=f"Minimum length of a chunk allowed as a remainder (default: {defaults['min_chunk_length']})", + ) + parser.add_argument( + "--max-files-per-directory", + type=int, + help=f"Max files per directory before moving to next computed dir (default: {defaults['max_files_per_directory']})", + ) + parser.add_argument( + "--max-dirs-per-directory", + type=int, + help=f"Max subdirectories per directory level (default: {defaults['max_dirs_per_directory']})", + ) + parser.add_argument( + "--delete-existing-files", + action="store_true", + help=f"Delete existing files within computed output dirs (default: {defaults['delete_existing_files']})", + ) + parser.add_argument( + "--unique-file-names", + action="store_true", + help=f"Make output file names unique across dirs by including dir_index (default: {defaults['unique_file_names']})", + ) + parser.add_argument( + "--delete-original-file", + action="store_true", + help=f"Delete original input FASTA after splitting (default: {defaults['delete_original_file']})", + ) + 
parser.add_argument( + "--force-max-seq-length", + action="store_true", + help=f"Chunk single sequences longer than max-seq-length-per-file (default: {defaults['force_max_seq_length']})", + ) + + args = parser.parse_args(argv) + init_logging_with_args(args) + + params = Params( + fasta_file=args.fasta_file, + out_dir=args.out_dir, + write_agp=args.write_agp, + max_seqs_per_file=args.max_seqs_per_file, + max_seq_length_per_file=args.max_seq_length_per_file, + min_chunk_length=args.min_chunk_length, + max_files_per_directory=args.max_files_per_directory, + max_dirs_per_directory=args.max_dirs_per_directory, + delete_existing_files=args.delete_existing_files, + unique_file_names=args.unique_file_names, + delete_original_file=args.delete_original_file, + force_max_seq_length=args.force_max_seq_length, + ) + return params + + +def main(argv: Optional[List[str]] = None) -> None: + try: + params = parse_args(argv) + split_fasta(params) + except Exception: + logging.exception("Error processing FASTA file '%s'", params.fasta_file) + raise + + +if __name__ == "__main__": + main() From 8a6adaa760dcd8a067d831d27e9b5e39bf56bf8c Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Fri, 30 Jan 2026 14:29:53 +0000 Subject: [PATCH 05/36] Add pytest tests for split_fasta.py --- requirements-dev.txt | 2 + .../conftest.cpython-311-pytest-9.0.2.pyc | Bin 0 -> 1583 bytes ...t_split_fasta.cpython-311-pytest-9.0.2.pyc | Bin 0 -> 23708 bytes tests/conftest.py | 24 +++ tests/test_split_fasta.py | 144 ++++++++++++++++++ 5 files changed, 170 insertions(+) create mode 100644 requirements-dev.txt create mode 100644 tests/__pycache__/conftest.cpython-311-pytest-9.0.2.pyc create mode 100644 tests/__pycache__/test_split_fasta.cpython-311-pytest-9.0.2.pyc create mode 100644 tests/conftest.py create mode 100644 tests/test_split_fasta.py diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..c0367d2 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,2 @@ 
+biopython +pytest \ No newline at end of file diff --git a/tests/__pycache__/conftest.cpython-311-pytest-9.0.2.pyc b/tests/__pycache__/conftest.cpython-311-pytest-9.0.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..187575a0d07bcbd9727705c0be55b2143b47d56e GIT binary patch literal 1583 zcmZ`&&1)M+6rcT&cC{-ja*CU#YRVK!QWdJL$+d(M*AzES2rY)9%d*yvqIFiY)U{5r32AX7<1#hes&`RP(O%87OC;3eK&v#MIjkZ7_`>LcE~G z85vRwLR1@TaG>&6#CwLh<%fWd?E^r0Ug?_keK2*CN`RN5*+{Fvv%feN+65h+>kiM$ zRbr~fG!xUTpt};)$Kqfklj4JHil)D=){zc*G)LozcIA%zmDZ6rv{bHpp3bM3`^|Nv zAHa|DY(`)IpV!<(bAmcAY40l2{tn{Fm`Eq6qs)4gKu6)-GIqW(AYZG2d63R{?#^YF+Id#6Z1DArf@4{9v12cs4xEax@D08`&vXKI^zKD!|FZK0ni z`9aO0e6(3sI6;`@KW(?D*F?)02bdL;JQ4+=%1pGy4J7Qc%5oTmfsKQJGIJywieE;J zg%!>s;{Oo+d9i4%5rm1={DNA`*H^3-+M;zo*c@3S7Da>-s}{8C0`c~+^HU`miQzZ6 zJu#W&H%_|Xc^=F!eE0dcg$ISx;^mX#<*4{hU+njf=ljL0-IXW${QgR$UpmM|dil}E zk$&~qkM!%k<+Vs(8!8}Ee6EB1!u@Nf=HiLDc>LZUrO3S5H*a=78fa#>vriw~>s@&F zapCn|>BjGOIpI4hto945J$>~but}cUW>5&hvBw8K3%lK?Kpdf@M4M9R504a9z`rSW{FNG zJcBC>aofSHFm855^cokrCou}N!TX=gN;Z*ZTHV{CZH(e#{tWOXE{aQ>ywE>L+IS6@ z`B4lr>2NYsBuN^GdjI#+0}JCl0I&6C_Y*MJjh|r_$i?ofdvET0Jqbn<>`QPc7o~GU Qz{xWO$ZySXVs;$ezw@M@X#fBK literal 0 HcmV?d00001 diff --git a/tests/__pycache__/test_split_fasta.cpython-311-pytest-9.0.2.pyc b/tests/__pycache__/test_split_fasta.cpython-311-pytest-9.0.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9a063375becde6ff48f52c527835c05f70f38849 GIT binary patch literal 23708 zcmeHPYj6}-cJ7|(dG# znJOjmrUa{9%PDWHO{mmt%1a=SS}K(v{+IQ?tyF%@s77T~S5cuVaT2QXLw;4c_?7S6 z+tbtCngMyeTd~V(wa%P-?m72y?xW9r{(UqW7V!Mrf4@0$woVZKgB<0@V6ry+(?@j5dBnUU~&!3$_&ZCoz`JA!B%Zgc#;=2mJ8C$TsJIE zq8=04gkC|D&U-F+UlGoWqVS3!;*We1{v(EX{+{ogKGfZJPYfqKMld}#IIfJ04I6>6 z^!d!lSlaLnr8GU2@EX$iM0!l66dJzs>WH3Bc+~*80xHY6W_X59Ym}ze*4BHwx4o#P zRc+hoS$*5nPaWPimcF15Wya4ZN5_?OnY6Y|Pis1dNo^uCq9<8ATPNNyWEMJitE&(m zRWI#l0F%P)P;^R}*`MEX>#;(puNdkph58l+(bxIGQ@4ZBX?f<*>;B8`yuaA6rPQ#cD0h_Pj)K(j z8T{~Cps&%GqDGRS6_2hWmI2O`I}y66-I^kX2rZ!c`4rVqeyQF 
zI#o6EIb$`GtQSqeQRxo|SHX3o%qlojQ>&qj$<>?5bTUirpBjp)QZ>9Ts#NL@jgU4- z)1Q$tNh4X|)lg2%if7>F&pV#!@G;@KfS$rLqM+0~TYFSu*C@lSyrk}h_fk;}j*m_Z zT(rWBsZ+OrkCdVP zE5M|%5ROdlza48un~Wt&vBaX_@$C~iobud`*3WF38O$FlM7xU7u2QrM4J))yyyaPl z#NU4A@-sa7ze>OV`mIg3hKrjAN}C5jD>gq-0*pLaiac3}Jc(8gAuh@x8agOw=`OmR z$By&1UX}UFJ92FD8Lb}Kxw>OtyYItxX&Q#|&5Kh<-+uY>%QMQQ*QQ^y$h>}26ICAqsGhke;bUfQ9cAv<(sDC+t0gNUlj|vy z_rx}2F^sRGs%z&*^wVk=!u{QHNNLr!0#FY7`8(5_uLpLm_t$&>Vebdt;-h;@kM0Fw z>ZrxSa$7x*U#|e~i6P|8$G*s4BU=wsw#)Ut&?dd%5%K5V1tdHfvpY@-y z(wYl+glxdsJ>`PgU_{7;vLWqnm9XN`qm@|BSl@A{%#Q?Zl}??k`J5qLvM1}(h3e#^ zCSR3r3|59C#0sa+nZFubfn{+8om`51MY`xmnTK`yDw~fZ1~MSnlMOp%ovuH_DCvTj z^Woo*{{a4jxsXmZ+QM;t1O+QJwmA^Kqt8bxK$zR$Bt499QbTW_rjr`fBN^DS`CV zeFJN}t1dXo#nN(Cc9upBu8@;UslVc9H9o2|D2>_Zh{#;AWz_$eGbE?Wtp8N^W&ES= zvO(*rBlmnGV;K7wUW{oBY2W@s#|=qKpY1gS*c+`VM?+T9+92!?ePn#>Acn4CHtb0b zjbzf?whi$iCXA1JeguZ!i$_x$2CRDP*tp)>-ZAmUu7nnT(?nYP8EI#k1O%iG^6de- zjrnd*Xo;=q3$V$xaIPZ}&Lt$n3kCw)3G5`W>n>S}Fd{qy{R#2n5enn+a=1Xus1Pg* z{;{$b5YwyI^rH9>NjTR!z>~K-p&8x`#^^MY6i1E35Gkp@dCHJym7WcL^iwnUmU4H(CaO{jmwvI16? 
z6-qVFey<@{96Rie$Lx;B?T$X~;7QzV_t|N8?Bd$tY`r}W*LICMS{hH;9zI-eVwxKp zXO{7)H<_IhYQl-JJAgvMK{kyH!n0cEz~tGG&C;;gxHo( z1S#hKoWSJ4#jxOO{z%$1CvD0sl{BLEdt$cTVmiRO%{9UGq{`iQ$}5 zX5pUPhqHj!V&s&c?yzV}2t2e}lJoJ6E>Sz>{<6qlIVC)J8v&A>#*|FyHJ$PnF2tSJ zdaWRMK6#85gf9=tlxB)TV0I~sc`3q&H+IXKtVTAZ{%3Sb54NoBQ>V+ZrR`R^L)K*j zD#4|8cuiUNcQU+GcG#Pf>~=pab?GZv^|0w~MxE9fEcIG;z02+u1)aw0=2OL*wAt-h zmj#gvC^GE)KsNX~D<@{{f2v#dqad6KmP3XW8G09Y>>#Algap@FsdmN=^)oP5v z9kss(*Z)f`VRn1-5p@qrfLmhWeb!@mv|e?9e4YR>!uzcUKHH7(A?tyA$6e}d4q}RW zA4$+shfzP8x{!qZuO%naD!1{U0{uEn?Ee1i;>8}aaCemLXdo^V8@4YQ(f!nH)T(pgmMi-esdaE!ol0w)N(MBrrtG^NEuj4(@A{Q(hvNPwBP64u}}^;L481F+kn z+f{|ge`1ljMY>BJ1cj|(*=$gAet!cqLb)*w8+56LbCB(2^7txOw?;n>*8jW>mSXq2 z-T4EzP8EW?i^1Kc;O^UBk-iv^BL2w(U%(=*gXwv^D6K6?YYWm^w;8*39a*v=X36@< zk_CVz8>wc=ejHvud#o7VSPE~1-7LqZv`c-{eFbUF%vpe`zL~R^`+@LxURqNs#VKUqnX*d=_OLB3OL9uHXk5}ps4(?$CeAT)))D~+a2|sNIR(6&o%0}L zb}5UQBe&(Y{QmM{;6S0P{7M64+jkAj$pZ!8?46eeN@d(6G9KW?vKQyEobck5c^NjT zDZZ3q58q<5WRXCpsEuGTA-SB(Ol4OkP2hnSkHdnT0%nuHoChl^i_?WHHC?SqVVX{P zu<4rpDwD#}P1jCo-ls zKsKO#T7Ak6(~bc)?FjBArefSNA!SV`RxuTAhHJ&7)Ggz(im7Oqus+kZCGa7RyQY_G zr)QNTuU0Cm7y(sN;pSsTKy^-+f$xad%9Y#nRO(DqF{sol4cVY+Du#6WLYa>vBrEQ$ zYl!1CL0~F2!c3$I>;TH&tE^(n2jOh^jAd-*WGo-Z&Jv`FY3(PCeS2u9K*RGRleQw7}eIZL}rO&NW zUz~kzo${tG#T2{P^Af!HVnM-(o~ClK%@) zNCYZN;VP|r8>_IsJD4u7*_t+s(QK~anr>^Y+^BP#H|kjmrGa8J%^r5rCD~)FUP2>F zNmaqG>I7)XTm2D%3k2Q(IHK+cytvP{WL{-YEB3PT&9*=)%kz%iw1yYEbg{}#yIR?X zR`rhwn;^gz?4`l+jHc5504(}9ux+gr7+=;$oY)-vePSjt#}Ym*`9DiO-vhWS0bt9p z*lS2EcRz#M5MR1Wn_=(LV*Xv4>PAAyLwy_Ip7vjoymZ!uWg9yTXy>bIKLFne@Fsa!i~nktoV4h#P7!#GKoY>dopq;r z$)fNg>(S~!v2PI|obOfbWZML|ZOklSY`=g_oYY2Xe=et|#&C>rctZUfghC3|LYNGm z6c)pR+<2cItk3*1{(k({!Qc7|P0tmZo+~vyS6KagQ94?Zjuxb&?xp{RriF(#F0?+f z(6WAEZR=v3?EwhV9smIB0cfbU2jC085P77$yuB!Reg1{u`i0y?3ljKW|6%|?6)MBZV$AUg@qJFfF7^qDRj78aF9Jwbb*XAzOw=xapP;eL zx4ol!(AB)`p~63ssEAf_-|eqZD`{c0iqnNvd|5b^zNsZN+|Q zIX1ngAT>|znc8z{@AO_m3GjDbYA#95MF#DZDeTlBAJ909jZmTO;Y=pSUP}ZVz&RiH zf99fX#JfSm3XLeAf>n^M-zfw0U;JWTPxtqxBotL^xWt>wO 
z3(z9-;r>&oVZ>*0uQNTl0K7P2K~4d)6*?E^&|t`DQpuNU9c|9=x&P(e{ zWt>xp16YxHaBtEtv2oXIBX@Cj;2g0ar-0WAoAcYHEM|^4653s8p}bG+!S4gKN-SXI zDloN*wA!@V@(0xFx2rPTDc9j|U1RGhtSi4#54EJ8Ik~4GneV*RQ!3+}$XI|DnGg4; z#zuT5_d3&)3&6_?&Q1Zd6*?E^(8bK1jyn9UX>2`(7Roy{wg+jISiox6BXtZmt+xCD zwfgO9Y`hms` zd2sn=Y>eWHk1vU+ic{ljBPemPZeNO zo{Viajl{iVn~hzfvdzY|G0zchh0C>ZOj%`avx(5=yKKbV2pDy2v$4l#ZGX!?Xxkc{ zWDntAMxL?ZbVV0j7fVh?e0U4<5BHe~TJc)Zz={S|H1Lhm0Imi+b;fpxi*;hwF|HM4 zMFT4uSkb^YTm!5V+kRaAQ&{|WY0(5Xs91+g*~*&w=b)-@5%@8I{RC*e=`M}e*y<5q zj$#X9mx=Zh0@DO&fhiHIeHxS&ba3%mS~u_eF#M?r+_$RG^(DIPgsAgKC_=2Z`g4(D zWQRf7YK|XAIMG8J;-LE1-8JKNsfmLK|HQz6)24EC3T{rpJiQaV;* z*wN46gN}aM)_)eGZ1E>Ri$4HZ{Hd$9_`_!4^J{)FP?S4Ma_8iMJJI;$!G&l&u7&hH z&cKwo5R82kY?%wT%x;|Bcjb}1rxeqMQ#L3p3rQps2 zd+*>f)y>nGzHggN;S$ahK&A#}Pt3_}!1y~awUx>^rw|9QBJ<$hQ#h7LY}_?_g1b07 zaE@4zQ^0G5&H3$87Bfc{zAXvm$>kRXHXFh6eM9WDw9%5}mkn*@%ayn(XseoL*E<;@ zi$r%6Ga742Mu4vD%#55;so@%a9)RI}YGj;Wgc<<0^;r5WzX2&~y7>Lhyb05_#QX#| zTf1e;v8Rbf2cy|qCu~BF>U2wh&LUY0SE(mP}=ey|lh@CSE?qL;i6|xhAMKO_=&;mr<*6IX7pBT?W z3O{a8=M_6=;e{VJh(yKi{P9HrKW@;vN_^rPr10YgTOSp-ez0m$z>gdJP+TiMa!X$n K@KX~ot^Xh0r`|dM literal 0 HcmV?d00001 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..766dbc3 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,24 @@ +import importlib.util +from pathlib import Path + +import pytest + + +@pytest.fixture(scope="session") +def split_fasta_module(): + """ + Load modules/ensembl/fasta/splitfasta/split_fasta.py as a Python module + regardless of whether 'modules/' is a Python package. 
+ """ + repo_root = Path(__file__).resolve().parents[1] + module_path = ( + repo_root / "modules" / "ensembl" / "fasta" / "splitfasta" / "split_fasta.py" + ) + + spec = importlib.util.spec_from_file_location("split_fasta", module_path) + if spec is None or spec.loader is None: + raise RuntimeError(f"Could not load module spec from {module_path}") + + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod diff --git a/tests/test_split_fasta.py b/tests/test_split_fasta.py new file mode 100644 index 0000000..8a48af2 --- /dev/null +++ b/tests/test_split_fasta.py @@ -0,0 +1,144 @@ +# tests/test_split_fasta.py +from pathlib import Path + +import pytest +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord + + +def write_fasta(path: Path, records): + with open(path, "w", encoding="utf-8", newline="\n") as fh: + SeqIO.write(records, fh, "fasta") + + +def list_output_fastas(out_dir: Path): + return sorted(out_dir.rglob("*.fa")) + + +def read_all_ids_from_fastas(out_dir: Path): + ids = [] + for fa in list_output_fastas(out_dir): + with open(fa, "r", encoding="utf-8") as fh: + ids.extend([r.id for r in SeqIO.parse(fh, "fasta")]) + return ids + + +def parse_agp_lines(agp_path: Path): + lines = [l.rstrip("\n") for l in agp_path.read_text(encoding="utf-8").splitlines()] + lines = [l for l in lines if l and not l.startswith("#")] + return [l.split("\t") for l in lines] + + +def test_no_agp_by_default(tmp_path: Path, split_fasta_module): + inp = tmp_path / "in.fa" + out = tmp_path / "out" + write_fasta(inp, [SeqRecord(Seq("ACGT"), id="seq1", description="")]) + + params = split_fasta_module.Params( + fasta_file=inp, + out_dir=out, + write_agp=False, + ) + split_fasta_module.split_fasta(params) + + assert not (out / "in.agp").exists() + assert len(list_output_fastas(out)) >= 1 + + +def test_split_by_max_seqs_per_file(tmp_path: Path, split_fasta_module): + inp = tmp_path / "in.fa" + out = tmp_path / "out" + recs = [ + 
SeqRecord(Seq("A" * 10), id="s1", description=""), + SeqRecord(Seq("C" * 10), id="s2", description=""), + SeqRecord(Seq("G" * 10), id="s3", description=""), + ] + write_fasta(inp, recs) + + params = split_fasta_module.Params( + fasta_file=inp, + out_dir=out, + max_seqs_per_file=2, + write_agp=False, + ) + split_fasta_module.split_fasta(params) + + fas = list_output_fastas(out) + assert len(fas) == 2 + assert read_all_ids_from_fastas(out) == ["s1", "s2", "s3"] + + +def test_chunk_merge_final_small_chunk_and_agp(tmp_path: Path, split_fasta_module): + """ + seq_len=2100, max=1000 -> chunks [1000, 1000, 100] + min_chunk_length=200 -> final chunk merged -> [1000, 1100] + """ + inp = tmp_path / "in.fa" + out = tmp_path / "out" + write_fasta(inp, [SeqRecord(Seq("A" * 2100), id="chr1", description="chr1")]) + + params = split_fasta_module.Params( + fasta_file=inp, + out_dir=out, + write_agp=True, + force_max_seq_length=True, + max_seq_length_per_file=1000, + min_chunk_length=200, + max_seqs_per_file=100000, # avoid seq-count splitting interfering + ) + split_fasta_module.split_fasta(params) + + # 2 chunks expected after merge + assert read_all_ids_from_fastas(out) == [ + "chr1_chunk_start_0", + "chr1_chunk_start_1000", + ] + + agp = out / "in.agp" + assert agp.exists() + + cols = parse_agp_lines(agp) + assert len(cols) == 2 + + # object, obj_beg, obj_end, part_no, type, comp_id, comp_beg, comp_end, orient + assert cols[0][0] == "chr1" + assert cols[0][1:4] == ["1", "1000", "1"] + assert cols[0][4] == "W" + assert cols[0][5] == "chr1_chunk_start_0" + assert cols[0][6:9] == ["1", "1000", "+"] + + assert cols[1][0] == "chr1" + assert cols[1][1:4] == ["1001", "2100", "2"] + assert cols[1][4] == "W" + assert cols[1][5] == "chr1_chunk_start_1000" + assert cols[1][6:9] == ["1", "1100", "+"] + + +def test_agp_part_numbers_restart_per_object(tmp_path: Path, split_fasta_module): + inp = tmp_path / "in.fa" + out = tmp_path / "out" + recs = [ + SeqRecord(Seq("A" * 1200), id="obj1", 
description=""), + SeqRecord(Seq("C" * 1200), id="obj2", description=""), + ] + write_fasta(inp, recs) + + params = split_fasta_module.Params( + fasta_file=inp, + out_dir=out, + write_agp=True, + force_max_seq_length=True, + max_seq_length_per_file=1000, + min_chunk_length=100, # => 2 chunks each, no merge + ) + split_fasta_module.split_fasta(params) + + cols = parse_agp_lines(out / "in.agp") + + by_obj = {} + for c in cols: + by_obj.setdefault(c[0], []).append(int(c[3])) + + assert by_obj["obj1"] == [1, 2] + assert by_obj["obj2"] == [1, 2] From 1dbf7ebbe7a14451b6747443851c22c8003771fe Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Fri, 30 Jan 2026 14:34:48 +0000 Subject: [PATCH 06/36] Add Nextflow module and tests for running split_fasta.py --- .gitignore | 2 + .../ensembl/fasta/splitfasta/environment.yml | 8 + modules/ensembl/fasta/splitfasta/main.nf | 106 ++++++ .../fasta/splitfasta/tests/data/agp/test.agp | 4 + .../fasta/splitfasta/tests/data/real/in.fa | 6 + .../tests/data/splits/default/0/test.1.fa | 4 + .../tests/data/splits/default/0/test.2.fa | 2 + .../tests/data/splits/multi_dir/0/0/test.1.fa | 2 + .../tests/data/splits/multi_dir/0/1/test.2.fa | 2 + .../tests/data/splits/unique/0/test.0.1.fa | 2 + .../tests/data/splits/unique/0/test.0.2.fa | 2 + .../fasta/splitfasta/tests/main.nf.test | 301 ++++++++++++++++++ .../fasta/splitfasta/tests/main.nf.test.snap | 168 ++++++++++ 13 files changed, 609 insertions(+) create mode 100644 modules/ensembl/fasta/splitfasta/environment.yml create mode 100644 modules/ensembl/fasta/splitfasta/main.nf create mode 100644 modules/ensembl/fasta/splitfasta/tests/data/agp/test.agp create mode 100644 modules/ensembl/fasta/splitfasta/tests/data/real/in.fa create mode 100644 modules/ensembl/fasta/splitfasta/tests/data/splits/default/0/test.1.fa create mode 100644 modules/ensembl/fasta/splitfasta/tests/data/splits/default/0/test.2.fa create mode 100644 
modules/ensembl/fasta/splitfasta/tests/data/splits/multi_dir/0/0/test.1.fa create mode 100644 modules/ensembl/fasta/splitfasta/tests/data/splits/multi_dir/0/1/test.2.fa create mode 100644 modules/ensembl/fasta/splitfasta/tests/data/splits/unique/0/test.0.1.fa create mode 100644 modules/ensembl/fasta/splitfasta/tests/data/splits/unique/0/test.0.2.fa create mode 100644 modules/ensembl/fasta/splitfasta/tests/main.nf.test create mode 100644 modules/ensembl/fasta/splitfasta/tests/main.nf.test.snap diff --git a/.gitignore b/.gitignore index e75900d..e03c5c1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ .nextflow* .nf-test* +__pycache__/ +.python-version \ No newline at end of file diff --git a/modules/ensembl/fasta/splitfasta/environment.yml b/modules/ensembl/fasta/splitfasta/environment.yml new file mode 100644 index 0000000..759f3da --- /dev/null +++ b/modules/ensembl/fasta/splitfasta/environment.yml @@ -0,0 +1,8 @@ +--- +name: "fasta_splitfasta" +channels: + - conda-forge + - bioconda +dependencies: + - python=3.11.7 + - biopython=1.86 \ No newline at end of file diff --git a/modules/ensembl/fasta/splitfasta/main.nf b/modules/ensembl/fasta/splitfasta/main.nf new file mode 100644 index 0000000..0a8b761 --- /dev/null +++ b/modules/ensembl/fasta/splitfasta/main.nf @@ -0,0 +1,106 @@ +// See the NOTICE file distributed with this work for additional information +// regarding copyright ownership. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +process FASTA_SPLITFASTA { + + tag "${meta.id}" + label 'process_low' + + publishDir "${params.outdir ?: '.'}", mode: 'copy' + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("**/*.fa"), emit: fasta + tuple val(meta), path("*.agp"), emit: agp, optional: true + + script: + def args = [] + + if (params.max_seqs_per_file) { + args << "--max-seqs-per-file ${params.max_seqs_per_file}" + } + + if (params.max_seq_length_per_file) { + args << "--max-seq-length-per-file ${params.max_seq_length_per_file}" + } + + if (params.min_chunk_length) { + args << "--min-chunk-length ${params.min_chunk_length}" + } + + if (params.max_files_per_directory) { + args << "--max-files-per-directory ${params.max_files_per_directory}" + } + + if (params.max_dirs_per_directory) { + args << "--max-dirs-per-directory ${params.max_dirs_per_directory}" + } + + if (params.force_max_seq_length) { + args << "--force-max-seq-length" + } + + if (params.write_agp) { + args << "--write-agp" + } + + if (params.unique_file_names) { + args << "--unique-file-names" + } + + if (params.delete_existing_files) { + args << "--delete-existing-files" + } + + """ + python \\ + ${moduleDir}/split_fasta.py \\ + --fasta-file \$PWD/${fasta} \\ + --out-dir \$PWD \\ + ${args.join(' ')} + """ + + stub: + """ + set -euo pipefail + + FIXTURE_DIR="${moduleDir}/tests/data" + + LAYOUT="default" + if [[ "${params.unique_file_names ?: false}" == "true" ]]; then + LAYOUT="unique" + elif [[ -n "${params.max_dirs_per_directory ?: ''}" || -n "${params.max_files_per_directory ?: ''}" ]]; then + LAYOUT="multi_dir" + fi + + mkdir -p splits + cp -R "\$FIXTURE_DIR/splits/\$LAYOUT/." 
"splits/" + + find splits -type f -name 'test*.fa' | while read -r f; do + bn=\$(basename "\$f") + dir=\$(dirname "\$f") + new_bn="\${bn/test/${meta.id}}" + mv "\$f" "\${dir}/\${new_bn}" + done + + if [[ "${params.write_agp ?: false}" == "true" ]]; then + cp "\$FIXTURE_DIR/agp/test.agp" "${meta.id}.agp" + fi + """ + + +} diff --git a/modules/ensembl/fasta/splitfasta/tests/data/agp/test.agp b/modules/ensembl/fasta/splitfasta/tests/data/agp/test.agp new file mode 100644 index 0000000..46fc419 --- /dev/null +++ b/modules/ensembl/fasta/splitfasta/tests/data/agp/test.agp @@ -0,0 +1,4 @@ +# AGP-version 2.0 +seq1 1 10 1 W seq1 1 10 + +seq2 1 10 1 W seq2 1 10 + +seq3 1 11 1 W seq3 1 11 + \ No newline at end of file diff --git a/modules/ensembl/fasta/splitfasta/tests/data/real/in.fa b/modules/ensembl/fasta/splitfasta/tests/data/real/in.fa new file mode 100644 index 0000000..3d3f65c --- /dev/null +++ b/modules/ensembl/fasta/splitfasta/tests/data/real/in.fa @@ -0,0 +1,6 @@ +>seq1 +AAAAAAAAAA +>seq2 +CCCCCCCCCC +>seq3 +GGGGGGGGGGG \ No newline at end of file diff --git a/modules/ensembl/fasta/splitfasta/tests/data/splits/default/0/test.1.fa b/modules/ensembl/fasta/splitfasta/tests/data/splits/default/0/test.1.fa new file mode 100644 index 0000000..7abe938 --- /dev/null +++ b/modules/ensembl/fasta/splitfasta/tests/data/splits/default/0/test.1.fa @@ -0,0 +1,4 @@ +>seq1 +AAAAAAAAAA +>seq2 +CCCCCCCCCC \ No newline at end of file diff --git a/modules/ensembl/fasta/splitfasta/tests/data/splits/default/0/test.2.fa b/modules/ensembl/fasta/splitfasta/tests/data/splits/default/0/test.2.fa new file mode 100644 index 0000000..6287efa --- /dev/null +++ b/modules/ensembl/fasta/splitfasta/tests/data/splits/default/0/test.2.fa @@ -0,0 +1,2 @@ +>seq3 +GGGGGGGGGGG \ No newline at end of file diff --git a/modules/ensembl/fasta/splitfasta/tests/data/splits/multi_dir/0/0/test.1.fa b/modules/ensembl/fasta/splitfasta/tests/data/splits/multi_dir/0/0/test.1.fa new file mode 100644 index 
0000000..9512f36 --- /dev/null +++ b/modules/ensembl/fasta/splitfasta/tests/data/splits/multi_dir/0/0/test.1.fa @@ -0,0 +1,2 @@ +>seq1 +AAAAAAAAAA \ No newline at end of file diff --git a/modules/ensembl/fasta/splitfasta/tests/data/splits/multi_dir/0/1/test.2.fa b/modules/ensembl/fasta/splitfasta/tests/data/splits/multi_dir/0/1/test.2.fa new file mode 100644 index 0000000..2f3b40f --- /dev/null +++ b/modules/ensembl/fasta/splitfasta/tests/data/splits/multi_dir/0/1/test.2.fa @@ -0,0 +1,2 @@ +>seq2 +CCCCCCCCCC \ No newline at end of file diff --git a/modules/ensembl/fasta/splitfasta/tests/data/splits/unique/0/test.0.1.fa b/modules/ensembl/fasta/splitfasta/tests/data/splits/unique/0/test.0.1.fa new file mode 100644 index 0000000..9512f36 --- /dev/null +++ b/modules/ensembl/fasta/splitfasta/tests/data/splits/unique/0/test.0.1.fa @@ -0,0 +1,2 @@ +>seq1 +AAAAAAAAAA \ No newline at end of file diff --git a/modules/ensembl/fasta/splitfasta/tests/data/splits/unique/0/test.0.2.fa b/modules/ensembl/fasta/splitfasta/tests/data/splits/unique/0/test.0.2.fa new file mode 100644 index 0000000..2f3b40f --- /dev/null +++ b/modules/ensembl/fasta/splitfasta/tests/data/splits/unique/0/test.0.2.fa @@ -0,0 +1,2 @@ +>seq2 +CCCCCCCCCC \ No newline at end of file diff --git a/modules/ensembl/fasta/splitfasta/tests/main.nf.test b/modules/ensembl/fasta/splitfasta/tests/main.nf.test new file mode 100644 index 0000000..3db1283 --- /dev/null +++ b/modules/ensembl/fasta/splitfasta/tests/main.nf.test @@ -0,0 +1,301 @@ +// nf-core modules test fasta/splitfasta +nextflow_process { + + name "Test Process FASTA_SPLITFASTA" + script "../main.nf" + process "FASTA_SPLITFASTA" + + tag "modules" + tag "modules_ensembl" + tag "fasta" + tag "fasta/splitfasta" + + + def real_fa = new File("modules/ensembl/fasta/splitfasta/tests/data/real/in.fa").canonicalFile + + test("Stub outputs: default layout, no AGP") { + + when { + options "-stub" + + // Ensure params are set explicitly for this test + params.write_agp 
= false + params.unique_file_names = false + params.max_files_per_directory = null + params.max_dirs_per_directory = null + + process { + """ + input[0] = [[ id:'test' ], file('dummy.fa')] + """ + } + } + + then { + assert snapshot(process.out).match() + + // fasta: tuple(meta, fa_paths) + assert process.out.fasta != null + assert process.out.fasta.size() == 1 + + def fasta_out = process.out.fasta[0] + def meta = fasta_out[0] + def fas = fasta_out[1] + + assert meta.id == "test" + assert fas != null + assert fas.size() == 2 + + // agp: tuple(meta, agp_paths) optional -> should be absent + assert process.out.agp != null + assert process.out.agp.size() == 0 + + // Ensure FASTA parsing works (downstream contract) + def merged = fas + .collect { path(it).fasta } + .inject([:]) { acc, m -> acc + m } + + assert merged.keySet().containsAll(["seq1", "seq2", "seq3"]) + + assertAll( + { assert process.success } + ) + } + } + + test("Stub outputs: AGP optional output appears when enabled") { + + when { + options "-stub" + + params.write_agp = true + params.unique_file_names = false + params.max_files_per_directory = null + params.max_dirs_per_directory = null + + process { + """ + input[0] = [[ id:'test' ], file('dummy.fa')] + """ + } + } + + then { + assert snapshot(process.out).match() + + assert process.out.fasta.size() == 1 + def fasta_out = process.out.fasta[0] + def fas = fasta_out[1] + assert fas.size() == 2 + + assert process.out.agp.size() == 1 + def agp_out = process.out.agp[0] + def agp_meta = agp_out[0] + def agp = agp_out[1] + def agp_paths = agp instanceof List ? 
agp : [agp] + def agp_file = path(agp_paths[0]).toFile() + + assert agp_meta.id == "test" + assert agp_paths.size() == 1 + assert agp_file.name == "test.agp" + + def agp_text = agp_file.text + assert agp_text.startsWith("# AGP-version 2.0") + assert agp_text.contains("seq1\t1\t10\t1\tW\tseq1\t1\t10\t+") + assert agp_text.contains("seq2\t1\t10\t1\tW\tseq2\t1\t10\t+") + assert agp_text.contains("seq3\t1\t11\t1\tW\tseq3\t1\t11\t+") + + assertAll( + { assert process.success } + ) + } + } + + test("Stub outputs: unique_file_names contract") { + + when { + options "-stub" + + params.write_agp = false + params.unique_file_names = true + params.max_files_per_directory = null + params.max_dirs_per_directory = null + + process { + """ + input[0] = [[ id:'test' ], file('dummy.fa')] + """ + } + } + + then { + assert snapshot(process.out).match() + + def fasta_out = process.out.fasta[0] + def fas = fasta_out[1] + + assert fas.size() == 2 + assert process.out.agp.size() == 0 + + // Contract check: names match the unique fixture pattern + assert fas.collect { path(it).toFile().name }.sort() == ["test.0.1.fa", "test.0.2.fa"] + + assertAll( + { assert process.success } + ) + } + } + + test("Stub outputs: nested directory layout contract") { + + when { + options "-stub" + + params.write_agp = false + params.unique_file_names = false + + // Trigger stub's nested fixture selection + params.max_files_per_directory = 100 + params.max_dirs_per_directory = 100 + + process { + """ + input[0] = [[ id:'test' ], file('dummy.fa')] + """ + } + } + + then { + assert snapshot(process.out).match() + + def fastas = process.out.fasta[0][1] + assert fastas.size() == 2 + assert process.out.agp.size() == 0 + + def rels = fastas.collect { path(it).toString() } + assert rels.any { it.contains("splits/0/0/") } + assert rels.any { it.contains("splits/0/1/") } + + assertAll( + { assert process.success } + ) + } + } + + test("Real run: default behaviour produces FASTAs and no AGP") { + + when { + 
params.write_agp = false + params.unique_file_names = false + params.max_seqs_per_file = null + params.max_seq_length_per_file = null + params.max_files_per_directory = null + params.max_dirs_per_directory = null + params.force_max_seq_length = false + + process { + """ + input[0] = [[ id:'test' ], file('${real_fa.absolutePath}')] + """ + } + } + + then { + assert process.success + + assert process.out.fasta != null + assert process.out.fasta.size() == 1 + + def out = process.out.fasta[0] + def meta = out[0] + def fas = out[1] + + assert meta.id == "test" + def fas_list = (fas instanceof List) ? fas : [fas] + assert fas_list.size() >= 1 + + assert process.out.agp != null + assert process.out.agp.size() == 0 + + def merged = fas_list + .collect { path(it).fasta } + .inject([:]) { acc, m -> acc + m } + + assert merged.keySet().containsAll(["seq1", "seq2", "seq3"]) + } + } + + test("Real run: write_agp=true emits exactly one AGP file") { + + when { + params.write_agp = true + params.unique_file_names = false + params.max_files_per_directory = null + params.max_dirs_per_directory = null + params.max_seqs_per_file = null + params.max_seq_length_per_file = null + params.force_max_seq_length = false + + process { + """ + input[0] = [[ id:'test' ], file('${real_fa.absolutePath}')] + """ + } + } + + then { + assert process.success + + assert process.out.agp != null + assert process.out.agp.size() == 1 + + def agp_out = process.out.agp[0] + def agp_meta = agp_out[0] + def agp_val = agp_out[1] + + assert agp_meta.id == "test" + + def agp_list = (agp_val instanceof List) ? 
agp_val : [agp_val] + assert agp_list.size() == 1 + + def agp_path = path(agp_list[0]) + assert agp_path.fileName.toString().endsWith(".agp") + + def agp_text = agp_path.toFile().text + assert agp_text.startsWith("# AGP-version 2.0") + assert agp_text.contains("seq1\t1\t10\t1\tW\tseq1\t1\t10\t+") + assert agp_text.contains("seq2\t1\t10\t1\tW\tseq2\t1\t10\t+") + assert agp_text.contains("seq3\t1\t11\t1\tW\tseq3\t1\t11\t+") + } + } + + test("Real run: max_seqs_per_file=2 splits into 2 FASTA outputs") { + + when { + params.write_agp = false + params.max_seqs_per_file = 2 + params.unique_file_names = false + params.max_files_per_directory = null + params.max_dirs_per_directory = null + + process { + """ + input[0] = [[ id:'test' ], file('${real_fa.absolutePath}')] + """ + } + } + + then { + assert process.success + + def fas = process.out.fasta[0][1] + assert fas.size() == 2 + + def merged = fas + .collect { path(it).fasta } + .inject([:]) { acc, m -> acc + m } + + assert merged.keySet().containsAll(["seq1", "seq2", "seq3"]) + } + } +} diff --git a/modules/ensembl/fasta/splitfasta/tests/main.nf.test.snap b/modules/ensembl/fasta/splitfasta/tests/main.nf.test.snap new file mode 100644 index 0000000..3390583 --- /dev/null +++ b/modules/ensembl/fasta/splitfasta/tests/main.nf.test.snap @@ -0,0 +1,168 @@ +{ + "Stub outputs: AGP optional output appears when enabled": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + [ + "test.1.fa:md5,336490c5e8c624cb1ae29048f28f0978", + "test.2.fa:md5,55d5ca305356033516f7ae1b5ecca900" + ] + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.agp:md5,c12ac51bd2b1ca95cdd8f011eca0cd1c" + ] + ], + "agp": [ + [ + { + "id": "test" + }, + "test.agp:md5,c12ac51bd2b1ca95cdd8f011eca0cd1c" + ] + ], + "fasta": [ + [ + { + "id": "test" + }, + [ + "test.1.fa:md5,336490c5e8c624cb1ae29048f28f0978", + "test.2.fa:md5,55d5ca305356033516f7ae1b5ecca900" + ] + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": 
"2026-01-30T10:38:07.606463" + }, + "Stub outputs: nested directory layout contract": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + [ + "test.1.fa:md5,41e176f082cc04841e50d8aa5c4f4d5a", + "test.2.fa:md5,e3bd0305f6466c13a1479c6b82391e6f" + ] + ] + ], + "1": [ + + ], + "agp": [ + + ], + "fasta": [ + [ + { + "id": "test" + }, + [ + "test.1.fa:md5,41e176f082cc04841e50d8aa5c4f4d5a", + "test.2.fa:md5,e3bd0305f6466c13a1479c6b82391e6f" + ] + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-01-30T10:38:11.815126" + }, + "Stub outputs: default layout, no AGP": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + [ + "test.1.fa:md5,336490c5e8c624cb1ae29048f28f0978", + "test.2.fa:md5,55d5ca305356033516f7ae1b5ecca900" + ] + ] + ], + "1": [ + + ], + "agp": [ + + ], + "fasta": [ + [ + { + "id": "test" + }, + [ + "test.1.fa:md5,336490c5e8c624cb1ae29048f28f0978", + "test.2.fa:md5,55d5ca305356033516f7ae1b5ecca900" + ] + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-01-30T10:38:05.482323" + }, + "Stub outputs: unique_file_names contract": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + [ + "test.0.1.fa:md5,41e176f082cc04841e50d8aa5c4f4d5a", + "test.0.2.fa:md5,e3bd0305f6466c13a1479c6b82391e6f" + ] + ] + ], + "1": [ + + ], + "agp": [ + + ], + "fasta": [ + [ + { + "id": "test" + }, + [ + "test.0.1.fa:md5,41e176f082cc04841e50d8aa5c4f4d5a", + "test.0.2.fa:md5,e3bd0305f6466c13a1479c6b82391e6f" + ] + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-01-30T10:38:09.698407" + } +} \ No newline at end of file From 2e62385135913b35874892a3ddedb17b0162ee13 Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Fri, 30 Jan 2026 14:43:29 +0000 Subject: [PATCH 07/36] Remove accidentally commited Python bytecode files --- .gitignore | 1 + .../conftest.cpython-311-pytest-9.0.2.pyc | Bin 1583 -> 0 bytes 
...est_split_fasta.cpython-311-pytest-9.0.2.pyc | Bin 23708 -> 0 bytes 3 files changed, 1 insertion(+) delete mode 100644 tests/__pycache__/conftest.cpython-311-pytest-9.0.2.pyc delete mode 100644 tests/__pycache__/test_split_fasta.cpython-311-pytest-9.0.2.pyc diff --git a/.gitignore b/.gitignore index e03c5c1..961b31c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .nextflow* .nf-test* __pycache__/ +*.pyc .python-version \ No newline at end of file diff --git a/tests/__pycache__/conftest.cpython-311-pytest-9.0.2.pyc b/tests/__pycache__/conftest.cpython-311-pytest-9.0.2.pyc deleted file mode 100644 index 187575a0d07bcbd9727705c0be55b2143b47d56e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1583 zcmZ`&&1)M+6rcT&cC{-ja*CU#YRVK!QWdJL$+d(M*AzES2rY)9%d*yvqIFiY)U{5r32AX7<1#hes&`RP(O%87OC;3eK&v#MIjkZ7_`>LcE~G z85vRwLR1@TaG>&6#CwLh<%fWd?E^r0Ug?_keK2*CN`RN5*+{Fvv%feN+65h+>kiM$ zRbr~fG!xUTpt};)$Kqfklj4JHil)D=){zc*G)LozcIA%zmDZ6rv{bHpp3bM3`^|Nv zAHa|DY(`)IpV!<(bAmcAY40l2{tn{Fm`Eq6qs)4gKu6)-GIqW(AYZG2d63R{?#^YF+Id#6Z1DArf@4{9v12cs4xEax@D08`&vXKI^zKD!|FZK0ni z`9aO0e6(3sI6;`@KW(?D*F?)02bdL;JQ4+=%1pGy4J7Qc%5oTmfsKQJGIJywieE;J zg%!>s;{Oo+d9i4%5rm1={DNA`*H^3-+M;zo*c@3S7Da>-s}{8C0`c~+^HU`miQzZ6 zJu#W&H%_|Xc^=F!eE0dcg$ISx;^mX#<*4{hU+njf=ljL0-IXW${QgR$UpmM|dil}E zk$&~qkM!%k<+Vs(8!8}Ee6EB1!u@Nf=HiLDc>LZUrO3S5H*a=78fa#>vriw~>s@&F zapCn|>BjGOIpI4hto945J$>~but}cUW>5&hvBw8K3%lK?Kpdf@M4M9R504a9z`rSW{FNG zJcBC>aofSHFm855^cokrCou}N!TX=gN;Z*ZTHV{CZH(e#{tWOXE{aQ>ywE>L+IS6@ z`B4lr>2NYsBuN^GdjI#+0}JCl0I&6C_Y*MJjh|r_$i?ofdvET0Jqbn<>`QPc7o~GU Qz{xWO$ZySXVs;$ezw@M@X#fBK diff --git a/tests/__pycache__/test_split_fasta.cpython-311-pytest-9.0.2.pyc b/tests/__pycache__/test_split_fasta.cpython-311-pytest-9.0.2.pyc deleted file mode 100644 index 9a063375becde6ff48f52c527835c05f70f38849..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 23708 zcmeHPYj6}-cJ7|(dG# 
znJOjmrUa{9%PDWHO{mmt%1a=SS}K(v{+IQ?tyF%@s77T~S5cuVaT2QXLw;4c_?7S6 z+tbtCngMyeTd~V(wa%P-?m72y?xW9r{(UqW7V!Mrf4@0$woVZKgB<0@V6ry+(?@j5dBnUU~&!3$_&ZCoz`JA!B%Zgc#;=2mJ8C$TsJIE zq8=04gkC|D&U-F+UlGoWqVS3!;*We1{v(EX{+{ogKGfZJPYfqKMld}#IIfJ04I6>6 z^!d!lSlaLnr8GU2@EX$iM0!l66dJzs>WH3Bc+~*80xHY6W_X59Ym}ze*4BHwx4o#P zRc+hoS$*5nPaWPimcF15Wya4ZN5_?OnY6Y|Pis1dNo^uCq9<8ATPNNyWEMJitE&(m zRWI#l0F%P)P;^R}*`MEX>#;(puNdkph58l+(bxIGQ@4ZBX?f<*>;B8`yuaA6rPQ#cD0h_Pj)K(j z8T{~Cps&%GqDGRS6_2hWmI2O`I}y66-I^kX2rZ!c`4rVqeyQF zI#o6EIb$`GtQSqeQRxo|SHX3o%qlojQ>&qj$<>?5bTUirpBjp)QZ>9Ts#NL@jgU4- z)1Q$tNh4X|)lg2%if7>F&pV#!@G;@KfS$rLqM+0~TYFSu*C@lSyrk}h_fk;}j*m_Z zT(rWBsZ+OrkCdVP zE5M|%5ROdlza48un~Wt&vBaX_@$C~iobud`*3WF38O$FlM7xU7u2QrM4J))yyyaPl z#NU4A@-sa7ze>OV`mIg3hKrjAN}C5jD>gq-0*pLaiac3}Jc(8gAuh@x8agOw=`OmR z$By&1UX}UFJ92FD8Lb}Kxw>OtyYItxX&Q#|&5Kh<-+uY>%QMQQ*QQ^y$h>}26ICAqsGhke;bUfQ9cAv<(sDC+t0gNUlj|vy z_rx}2F^sRGs%z&*^wVk=!u{QHNNLr!0#FY7`8(5_uLpLm_t$&>Vebdt;-h;@kM0Fw z>ZrxSa$7x*U#|e~i6P|8$G*s4BU=wsw#)Ut&?dd%5%K5V1tdHfvpY@-y z(wYl+glxdsJ>`PgU_{7;vLWqnm9XN`qm@|BSl@A{%#Q?Zl}??k`J5qLvM1}(h3e#^ zCSR3r3|59C#0sa+nZFubfn{+8om`51MY`xmnTK`yDw~fZ1~MSnlMOp%ovuH_DCvTj z^Woo*{{a4jxsXmZ+QM;t1O+QJwmA^Kqt8bxK$zR$Bt499QbTW_rjr`fBN^DS`CV zeFJN}t1dXo#nN(Cc9upBu8@;UslVc9H9o2|D2>_Zh{#;AWz_$eGbE?Wtp8N^W&ES= zvO(*rBlmnGV;K7wUW{oBY2W@s#|=qKpY1gS*c+`VM?+T9+92!?ePn#>Acn4CHtb0b zjbzf?whi$iCXA1JeguZ!i$_x$2CRDP*tp)>-ZAmUu7nnT(?nYP8EI#k1O%iG^6de- zjrnd*Xo;=q3$V$xaIPZ}&Lt$n3kCw)3G5`W>n>S}Fd{qy{R#2n5enn+a=1Xus1Pg* z{;{$b5YwyI^rH9>NjTR!z>~K-p&8x`#^^MY6i1E35Gkp@dCHJym7WcL^iwnUmU4H(CaO{jmwvI16? 
z6-qVFey<@{96Rie$Lx;B?T$X~;7QzV_t|N8?Bd$tY`r}W*LICMS{hH;9zI-eVwxKp zXO{7)H<_IhYQl-JJAgvMK{kyH!n0cEz~tGG&C;;gxHo( z1S#hKoWSJ4#jxOO{z%$1CvD0sl{BLEdt$cTVmiRO%{9UGq{`iQ$}5 zX5pUPhqHj!V&s&c?yzV}2t2e}lJoJ6E>Sz>{<6qlIVC)J8v&A>#*|FyHJ$PnF2tSJ zdaWRMK6#85gf9=tlxB)TV0I~sc`3q&H+IXKtVTAZ{%3Sb54NoBQ>V+ZrR`R^L)K*j zD#4|8cuiUNcQU+GcG#Pf>~=pab?GZv^|0w~MxE9fEcIG;z02+u1)aw0=2OL*wAt-h zmj#gvC^GE)KsNX~D<@{{f2v#dqad6KmP3XW8G09Y>>#Algap@FsdmN=^)oP5v z9kss(*Z)f`VRn1-5p@qrfLmhWeb!@mv|e?9e4YR>!uzcUKHH7(A?tyA$6e}d4q}RW zA4$+shfzP8x{!qZuO%naD!1{U0{uEn?Ee1i;>8}aaCemLXdo^V8@4YQ(f!nH)T(pgmMi-esdaE!ol0w)N(MBrrtG^NEuj4(@A{Q(hvNPwBP64u}}^;L481F+kn z+f{|ge`1ljMY>BJ1cj|(*=$gAet!cqLb)*w8+56LbCB(2^7txOw?;n>*8jW>mSXq2 z-T4EzP8EW?i^1Kc;O^UBk-iv^BL2w(U%(=*gXwv^D6K6?YYWm^w;8*39a*v=X36@< zk_CVz8>wc=ejHvud#o7VSPE~1-7LqZv`c-{eFbUF%vpe`zL~R^`+@LxURqNs#VKUqnX*d=_OLB3OL9uHXk5}ps4(?$CeAT)))D~+a2|sNIR(6&o%0}L zb}5UQBe&(Y{QmM{;6S0P{7M64+jkAj$pZ!8?46eeN@d(6G9KW?vKQyEobck5c^NjT zDZZ3q58q<5WRXCpsEuGTA-SB(Ol4OkP2hnSkHdnT0%nuHoChl^i_?WHHC?SqVVX{P zu<4rpDwD#}P1jCo-ls zKsKO#T7Ak6(~bc)?FjBArefSNA!SV`RxuTAhHJ&7)Ggz(im7Oqus+kZCGa7RyQY_G zr)QNTuU0Cm7y(sN;pSsTKy^-+f$xad%9Y#nRO(DqF{sol4cVY+Du#6WLYa>vBrEQ$ zYl!1CL0~F2!c3$I>;TH&tE^(n2jOh^jAd-*WGo-Z&Jv`FY3(PCeS2u9K*RGRleQw7}eIZL}rO&NW zUz~kzo${tG#T2{P^Af!HVnM-(o~ClK%@) zNCYZN;VP|r8>_IsJD4u7*_t+s(QK~anr>^Y+^BP#H|kjmrGa8J%^r5rCD~)FUP2>F zNmaqG>I7)XTm2D%3k2Q(IHK+cytvP{WL{-YEB3PT&9*=)%kz%iw1yYEbg{}#yIR?X zR`rhwn;^gz?4`l+jHc5504(}9ux+gr7+=;$oY)-vePSjt#}Ym*`9DiO-vhWS0bt9p z*lS2EcRz#M5MR1Wn_=(LV*Xv4>PAAyLwy_Ip7vjoymZ!uWg9yTXy>bIKLFne@Fsa!i~nktoV4h#P7!#GKoY>dopq;r z$)fNg>(S~!v2PI|obOfbWZML|ZOklSY`=g_oYY2Xe=et|#&C>rctZUfghC3|LYNGm z6c)pR+<2cItk3*1{(k({!Qc7|P0tmZo+~vyS6KagQ94?Zjuxb&?xp{RriF(#F0?+f z(6WAEZR=v3?EwhV9smIB0cfbU2jC085P77$yuB!Reg1{u`i0y?3ljKW|6%|?6)MBZV$AUg@qJFfF7^qDRj78aF9Jwbb*XAzOw=xapP;eL zx4ol!(AB)`p~63ssEAf_-|eqZD`{c0iqnNvd|5b^zNsZN+|Q zIX1ngAT>|znc8z{@AO_m3GjDbYA#95MF#DZDeTlBAJ909jZmTO;Y=pSUP}ZVz&RiH zf99fX#JfSm3XLeAf>n^M-zfw0U;JWTPxtqxBotL^xWt>wO 
z3(z9-;r>&oVZ>*0uQNTl0K7P2K~4d)6*?E^&|t`DQpuNU9c|9=x&P(e{ zWt>xp16YxHaBtEtv2oXIBX@Cj;2g0ar-0WAoAcYHEM|^4653s8p}bG+!S4gKN-SXI zDloN*wA!@V@(0xFx2rPTDc9j|U1RGhtSi4#54EJ8Ik~4GneV*RQ!3+}$XI|DnGg4; z#zuT5_d3&)3&6_?&Q1Zd6*?E^(8bK1jyn9UX>2`(7Roy{wg+jISiox6BXtZmt+xCD zwfgO9Y`hms` zd2sn=Y>eWHk1vU+ic{ljBPemPZeNO zo{Viajl{iVn~hzfvdzY|G0zchh0C>ZOj%`avx(5=yKKbV2pDy2v$4l#ZGX!?Xxkc{ zWDntAMxL?ZbVV0j7fVh?e0U4<5BHe~TJc)Zz={S|H1Lhm0Imi+b;fpxi*;hwF|HM4 zMFT4uSkb^YTm!5V+kRaAQ&{|WY0(5Xs91+g*~*&w=b)-@5%@8I{RC*e=`M}e*y<5q zj$#X9mx=Zh0@DO&fhiHIeHxS&ba3%mS~u_eF#M?r+_$RG^(DIPgsAgKC_=2Z`g4(D zWQRf7YK|XAIMG8J;-LE1-8JKNsfmLK|HQz6)24EC3T{rpJiQaV;* z*wN46gN}aM)_)eGZ1E>Ri$4HZ{Hd$9_`_!4^J{)FP?S4Ma_8iMJJI;$!G&l&u7&hH z&cKwo5R82kY?%wT%x;|Bcjb}1rxeqMQ#L3p3rQps2 zd+*>f)y>nGzHggN;S$ahK&A#}Pt3_}!1y~awUx>^rw|9QBJ<$hQ#h7LY}_?_g1b07 zaE@4zQ^0G5&H3$87Bfc{zAXvm$>kRXHXFh6eM9WDw9%5}mkn*@%ayn(XseoL*E<;@ zi$r%6Ga742Mu4vD%#55;so@%a9)RI}YGj;Wgc<<0^;r5WzX2&~y7>Lhyb05_#QX#| zTf1e;v8Rbf2cy|qCu~BF>U2wh&LUY0SE(mP}=ey|lh@CSE?qL;i6|xhAMKO_=&;mr<*6IX7pBT?W z3O{a8=M_6=;e{VJh(yKi{P9HrKW@;vN_^rPr10YgTOSp-ez0m$z>gdJP+TiMa!X$n K@KX~ot^Xh0r`|dM From af21a08044b4262f178be15f3a868949f5f8b82d Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Fri, 30 Jan 2026 15:47:18 +0000 Subject: [PATCH 08/36] Docstring updates and minor pytest refactor --- .../ensembl/fasta/splitfasta/split_fasta.py | 140 ++++++++++++++++-- tests/test_split_fasta.py | 87 +++++++---- 2 files changed, 186 insertions(+), 41 deletions(-) diff --git a/modules/ensembl/fasta/splitfasta/split_fasta.py b/modules/ensembl/fasta/splitfasta/split_fasta.py index 164ec44..5f4b0e1 100644 --- a/modules/ensembl/fasta/splitfasta/split_fasta.py +++ b/modules/ensembl/fasta/splitfasta/split_fasta.py @@ -1,6 +1,25 @@ #!/usr/bin/env python3 -"""Split a FASTA file (possibly gzipped) into multiple smaller FASTA files.""" +""" +Split a FASTA file into multiple FASTA files, optionally chunking long sequences. 
+ +This script reads an input FASTA (optionally gzipped) and writes one or more FASTA +files to an output directory. Records can be split across output files either by: + +- maximum number of records per file (``max_seqs_per_file``), and/or +- maximum cumulative sequence length per file (``max_seq_length_per_file``). + +If ``force_max_seq_length`` is enabled, individual sequences longer than +``max_seq_length_per_file`` are split into chunks. When chunking, a final remainder +chunk shorter than ``min_chunk_length`` can be merged into the previous chunk. + +Optionally, an AGP v2.0 file can be written describing how each input sequence +maps to output contigs/chunks. + +The implementation is designed to stream the input once and write outputs in a +single pass. +""" + import inspect import logging @@ -37,7 +56,19 @@ def init_logging_with_args(args): class Params: - """Class to hold parameters for splitting FASTA files.""" + """ + Validated configuration for splitting a FASTA file. + + Attributes correspond to CLI arguments and control: + - output location and cleanup behaviour, + - how records are grouped into output FASTA files, + - whether long sequences are chunked, and + - whether to write an AGP file describing the splits. + + Validation is performed in ``_validate_params()`` and will raise ``ValueError`` + for invalid combinations (e.g. ``min_chunk_length`` without + ``max_seq_length_per_file``). + """ def __init__( self, @@ -70,6 +101,13 @@ def __init__( self._validate_params() def _validate_params(self) -> None: + """ + Validate parameter values and combinations. + + Raises: + ValueError: If any numeric limit is <= 0, or if ``min_chunk_length`` is + set without ``max_seq_length_per_file``. 
+ """ if self.max_dirs_per_directory is not None and self.max_dirs_per_directory <= 0: raise ValueError("--max-dirs-per-directory must be > 0 or None") if ( @@ -95,8 +133,22 @@ def _validate_params(self) -> None: class OutputWriter: """ - Manages output file creation and counters, writing in a single pass. - Creates/cleans directories lazily as required. + Write split FASTA outputs and (optionally) an AGP file. + + The writer manages: + - output directory creation/cleanup (lazy, per-directory), + - output file naming (optionally unique across directories), + - record and length counters used to decide when to roll over to a new file, + - an optional AGP v2.0 file describing the mapping from original sequences + to output contigs/chunks. + + Notes: + Output layout is controlled by: + - ``max_files_per_directory``: how many FASTA files to write per directory + before incrementing the directory index. + - ``max_dirs_per_directory``: how directory indices are expanded into a + multi-level path (base-N style). + - ``unique_file_names``: whether to include directory index in filenames. """ def __init__(self, params: Params): @@ -135,7 +187,14 @@ def _create_or_clean_dir(self, dir_path: Path) -> None: raise def _get_subdir_path(self, dir_index: int) -> Path: - """Computes subdirectory path based on dir_index and max_dirs_per_directory.""" + """Return the output subdirectory path for a given directory index. + + Args: + dir_index: Zero-based directory index computed from file count. + + Returns: + A Path under ``params.out_dir`` into which output files are written. + """ parts = [] max_dirs = self.params.max_dirs_per_directory if max_dirs is None: @@ -150,9 +209,16 @@ def _get_subdir_path(self, dir_index: int) -> Path: return self.params.out_dir.joinpath(*parts) def _get_file_and_dir_index(self) -> Tuple[int, int]: - """ - Determines index of file and directory based on file count and max files per directory. - Returns (file_index, dir_index). 
+ """Compute the file index within a directory and the directory index. + + ``file_count`` increments monotonically for each output file. If + ``max_files_per_directory`` is set, files are grouped into directories such + that each directory contains at most that many files. + + Returns: + (file_index, dir_index) where: + - file_index is 1-based within the directory, and + - dir_index is 0-based across directories. """ max_files = self.params.max_files_per_directory if max_files is None: @@ -182,10 +248,18 @@ def add_agp_entry( part_id: str, part_length: int, ) -> None: - """Adds an entry to the AGP file.""" - # AGP columns for WGS contig component type: - # object, object_beg, object_end, part_number, component_type, - # component_id, component_beg, component_end, orientation + """ + Write a single AGP v2.0 component line for a chunk/contig. + Coordinates written to AGP are 1-based and inclusive. + + Args: + object_id: The original input sequence ID (AGP 'object'). + start: Start coordinate on the object (1-based, inclusive). + end: End coordinate on the object (1-based, inclusive). + part_nr: Component part number for this object (starts at 1 per object). + part_id: Output contig/chunk identifier (AGP 'component_id'). + part_length: Length of the component in bases. + """ if self._agp_fh is None: return try: @@ -224,7 +298,7 @@ def open_new_file(self) -> None: self.file_len = 0 def write_record(self, record: SeqRecord) -> None: - """Writes a SeqRecord to the current output file.""" + """Writes a SeqRecord to the current output file and update counters.""" try: SeqIO.write(record, self._fh, "fasta") self.record_count += 1 @@ -243,7 +317,11 @@ def close(self) -> None: def _get_param_defaults() -> dict: - """Retrieve default values for Params class attributes.""" + """ + Return default values from the ``Params`` constructor signature. + + Keeps CLI help text in sync with the defaults defined in ``Params.__init__``. 
+ """ signature = inspect.signature(Params.__init__) defaults = {} for name, param in signature.parameters.items(): @@ -253,7 +331,30 @@ def _get_param_defaults() -> dict: def split_fasta(params: Params) -> None: - """Splits the input FASTA file into multiple smaller FASTA files, chunking long sequences if required.""" + """ + Split an input FASTA into multiple output FASTA files. + + Records are streamed from the input file and written in a single pass. + Output file rollover can be triggered by: + - exceeding ``max_seqs_per_file`` (record-count based), and/or + - exceeding ``max_seq_length_per_file`` (cumulative sequence length per file). + + If ``force_max_seq_length`` is enabled and an individual record is longer than + ``max_seq_length_per_file``, the sequence is split into fixed-size chunks. + If the final remainder chunk is shorter than ``min_chunk_length``, it is merged + with the previous chunk (which may exceed ``max_seq_length_per_file``). + + When ``write_agp`` is enabled, an AGP v2.0 file is written describing the + mapping from each original sequence to its output contigs/chunks. + + Args: + params: Validated configuration controlling splitting/chunking behaviour. + + Raises: + FileNotFoundError: If the input FASTA does not exist. + ValueError: If parameter validation fails (raised when Params is created). + Exception: Propagates unexpected I/O or parsing errors. + """ if not params.fasta_file.exists(): logging.error( "DEBUG: fasta_file=%r resolved=%r cwd=%r", @@ -363,6 +464,15 @@ def split_fasta(params: Params) -> None: def parse_args(argv: Optional[List[str]] = None) -> Params: + """ + Parse CLI arguments and return a validated Params object. + + Args: + argv: Optional argument list for testing. If None, uses sys.argv. + + Returns: + A validated Params instance. + """ defaults = _get_param_defaults() parser = ArgumentParser( description="Split a FASTA file into multiple FASTA files, optionally chunking long sequences." 
diff --git a/tests/test_split_fasta.py b/tests/test_split_fasta.py index 8a48af2..0b5fb20 100644 --- a/tests/test_split_fasta.py +++ b/tests/test_split_fasta.py @@ -8,15 +8,18 @@ def write_fasta(path: Path, records): + """Write a list of SeqRecord objects to a FASTA file.""" with open(path, "w", encoding="utf-8", newline="\n") as fh: SeqIO.write(records, fh, "fasta") def list_output_fastas(out_dir: Path): + """Return all FASTA files produced under the output directory.""" return sorted(out_dir.rglob("*.fa")) def read_all_ids_from_fastas(out_dir: Path): + """Read and return all sequence IDs from all FASTA files under out_dir.""" ids = [] for fa in list_output_fastas(out_dir): with open(fa, "r", encoding="utf-8") as fh: @@ -25,18 +28,26 @@ def read_all_ids_from_fastas(out_dir: Path): def parse_agp_lines(agp_path: Path): + """ + Parse an AGP file into a list of column lists, excluding comments + and blank lines. + """ lines = [l.rstrip("\n") for l in agp_path.read_text(encoding="utf-8").splitlines()] lines = [l for l in lines if l and not l.startswith("#")] return [l.split("\t") for l in lines] def test_no_agp_by_default(tmp_path: Path, split_fasta_module): - inp = tmp_path / "in.fa" + """ + By default, splitting a FASTA should produce one or more FASTA outputs + but must NOT create an AGP file unless write_agp is explicitly enabled. 
+ """ + input_fasta = tmp_path / "in.fa" out = tmp_path / "out" - write_fasta(inp, [SeqRecord(Seq("ACGT"), id="seq1", description="")]) + write_fasta(input_fasta, [SeqRecord(Seq("ACGT"), id="seq1", description="")]) params = split_fasta_module.Params( - fasta_file=inp, + fasta_file=input_fasta, out_dir=out, write_agp=False, ) @@ -47,17 +58,22 @@ def test_no_agp_by_default(tmp_path: Path, split_fasta_module): def test_split_by_max_seqs_per_file(tmp_path: Path, split_fasta_module): - inp = tmp_path / "in.fa" + """ + When max_seqs_per_file is set, sequences should be split across + multiple FASTA files while preserving original sequence order + and IDs. + """ + input_fasta = tmp_path / "in.fa" out = tmp_path / "out" recs = [ SeqRecord(Seq("A" * 10), id="s1", description=""), SeqRecord(Seq("C" * 10), id="s2", description=""), SeqRecord(Seq("G" * 10), id="s3", description=""), ] - write_fasta(inp, recs) + write_fasta(input_fasta, recs) params = split_fasta_module.Params( - fasta_file=inp, + fasta_file=input_fasta, out_dir=out, max_seqs_per_file=2, write_agp=False, @@ -71,15 +87,19 @@ def test_split_by_max_seqs_per_file(tmp_path: Path, split_fasta_module): def test_chunk_merge_final_small_chunk_and_agp(tmp_path: Path, split_fasta_module): """ - seq_len=2100, max=1000 -> chunks [1000, 1000, 100] - min_chunk_length=200 -> final chunk merged -> [1000, 1100] + When force_max_seq_length is enabled, long sequences are chunked. + If the final chunk is shorter than min_chunk_length, it should be + merged with the previous chunk, and the AGP file must reflect the + merged coordinates correctly. 
""" - inp = tmp_path / "in.fa" + input_fasta = tmp_path / "in.fa" out = tmp_path / "out" - write_fasta(inp, [SeqRecord(Seq("A" * 2100), id="chr1", description="chr1")]) + write_fasta( + input_fasta, [SeqRecord(Seq("A" * 2100), id="chr1", description="chr1")] + ) params = split_fasta_module.Params( - fasta_file=inp, + fasta_file=input_fasta, out_dir=out, write_agp=True, force_max_seq_length=True, @@ -101,31 +121,46 @@ def test_chunk_merge_final_small_chunk_and_agp(tmp_path: Path, split_fasta_modul cols = parse_agp_lines(agp) assert len(cols) == 2 - # object, obj_beg, obj_end, part_no, type, comp_id, comp_beg, comp_end, orient - assert cols[0][0] == "chr1" - assert cols[0][1:4] == ["1", "1000", "1"] - assert cols[0][4] == "W" - assert cols[0][5] == "chr1_chunk_start_0" - assert cols[0][6:9] == ["1", "1000", "+"] - - assert cols[1][0] == "chr1" - assert cols[1][1:4] == ["1001", "2100", "2"] - assert cols[1][4] == "W" - assert cols[1][5] == "chr1_chunk_start_1000" - assert cols[1][6:9] == ["1", "1100", "+"] + # object, obj_start, obj_end, part_no, type, comp_id, comp_start, comp_end, orientation + assert cols[0] == [ + "chr1", + "1", + "1000", + "1", + "W", + "chr1_chunk_start_0", + "1", + "1000", + "+", + ] + assert cols[1] == [ + "chr1", + "1001", + "2100", + "2", + "W", + "chr1_chunk_start_1000", + "1", + "1100", + "+", + ] def test_agp_part_numbers_restart_per_object(tmp_path: Path, split_fasta_module): - inp = tmp_path / "in.fa" + """ + AGP part numbers must restart at 1 for each new input sequence + (object), even when multiple sequences are chunked in the same run. 
+ """ + input_fasta = tmp_path / "in.fa" out = tmp_path / "out" recs = [ SeqRecord(Seq("A" * 1200), id="obj1", description=""), SeqRecord(Seq("C" * 1200), id="obj2", description=""), ] - write_fasta(inp, recs) + write_fasta(input_fasta, recs) params = split_fasta_module.Params( - fasta_file=inp, + fasta_file=input_fasta, out_dir=out, write_agp=True, force_max_seq_length=True, From ac505b8714bd0cbf235995b1f20b1e8f7fa74cb2 Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Mon, 2 Feb 2026 12:09:06 +0000 Subject: [PATCH 09/36] Header updates --- .../ensembl/fasta/splitfasta/split_fasta.py | 24 ++++++++++++++++++- .../fasta/splitfasta/tests/main.nf.test | 15 ++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/modules/ensembl/fasta/splitfasta/split_fasta.py b/modules/ensembl/fasta/splitfasta/split_fasta.py index 5f4b0e1..b25846c 100644 --- a/modules/ensembl/fasta/splitfasta/split_fasta.py +++ b/modules/ensembl/fasta/splitfasta/split_fasta.py @@ -1,4 +1,19 @@ -#!/usr/bin/env python3 +#!env python3 + +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ Split a FASTA file into multiple FASTA files, optionally chunking long sequences. 
@@ -480,11 +495,13 @@ def parse_args(argv: Optional[List[str]] = None) -> Params: parser.add_argument( "--fasta-file", type=Path, + metavar="FASTA", required=True, help="Input raw or compressed FASTA file containing sequences to split", ) parser.add_argument( "--out-dir", + metavar="DIR", type=Path, help="Top-level output directory (default: input FASTA directory)", ) @@ -495,27 +512,32 @@ def parse_args(argv: Optional[List[str]] = None) -> Params: ) parser.add_argument( "--max-seqs-per-file", + metavar="N", type=int, help=f"Max records per output file (default: {defaults['max_seqs_per_file']})", ) parser.add_argument( "--max-seq-length-per-file", type=int, + metavar="BP", help=f"Max cumulative sequence length per output file (default: {defaults['max_seq_length_per_file']})", ) parser.add_argument( "--min-chunk-length", type=int, + metavar="BP", help=f"Minimum length of a chunk allowed as a remainder (default: {defaults['min_chunk_length']})", ) parser.add_argument( "--max-files-per-directory", type=int, + metavar="N", help=f"Max files per directory before moving to next computed dir (default: {defaults['max_files_per_directory']})", ) parser.add_argument( "--max-dirs-per-directory", type=int, + metavar="N", help=f"Max subdirectories per directory level (default: {defaults['max_dirs_per_directory']})", ) parser.add_argument( diff --git a/modules/ensembl/fasta/splitfasta/tests/main.nf.test b/modules/ensembl/fasta/splitfasta/tests/main.nf.test index 3db1283..c23c0cd 100644 --- a/modules/ensembl/fasta/splitfasta/tests/main.nf.test +++ b/modules/ensembl/fasta/splitfasta/tests/main.nf.test @@ -1,3 +1,18 @@ +// See the NOTICE file distributed with this work for additional information +// regarding copyright ownership. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + // nf-core modules test fasta/splitfasta nextflow_process { From 1ee480d5f3f806b7a3bccb9c202c8db2dd69e9b7 Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Tue, 3 Feb 2026 11:18:02 +0000 Subject: [PATCH 10/36] Moved python stuff to ensembl-genomio --- .../ensembl/fasta/splitfasta/environment.yml | 3 +- modules/ensembl/fasta/splitfasta/main.nf | 3 + .../fasta/splitfasta/tests/main.nf.test.snap | 102 ++-------- tests/conftest.py | 24 --- tests/test_split_fasta.py | 179 ------------------ 5 files changed, 18 insertions(+), 293 deletions(-) delete mode 100644 tests/conftest.py delete mode 100644 tests/test_split_fasta.py diff --git a/modules/ensembl/fasta/splitfasta/environment.yml b/modules/ensembl/fasta/splitfasta/environment.yml index 759f3da..2d01414 100644 --- a/modules/ensembl/fasta/splitfasta/environment.yml +++ b/modules/ensembl/fasta/splitfasta/environment.yml @@ -4,5 +4,4 @@ channels: - conda-forge - bioconda dependencies: - - python=3.11.7 - - biopython=1.86 \ No newline at end of file + - ensembl-genomio=1.6.1 \ No newline at end of file diff --git a/modules/ensembl/fasta/splitfasta/main.nf b/modules/ensembl/fasta/splitfasta/main.nf index 0a8b761..590a7ff 100644 --- a/modules/ensembl/fasta/splitfasta/main.nf +++ b/modules/ensembl/fasta/splitfasta/main.nf @@ -18,6 +18,9 @@ process FASTA_SPLITFASTA { tag "${meta.id}" label 'process_low' + conda "${moduleDir}/environment.yml" + container "ensemblorg/ensembl-genomio:v1.6.1" + publishDir "${params.outdir ?: '.'}", mode: 'copy' input: diff --git 
a/modules/ensembl/fasta/splitfasta/tests/main.nf.test.snap b/modules/ensembl/fasta/splitfasta/tests/main.nf.test.snap index 3390583..a27a644 100644 --- a/modules/ensembl/fasta/splitfasta/tests/main.nf.test.snap +++ b/modules/ensembl/fasta/splitfasta/tests/main.nf.test.snap @@ -3,42 +3,16 @@ "content": [ { "0": [ - [ - { - "id": "test" - }, - [ - "test.1.fa:md5,336490c5e8c624cb1ae29048f28f0978", - "test.2.fa:md5,55d5ca305356033516f7ae1b5ecca900" - ] - ] + ], "1": [ - [ - { - "id": "test" - }, - "test.agp:md5,c12ac51bd2b1ca95cdd8f011eca0cd1c" - ] + ], "agp": [ - [ - { - "id": "test" - }, - "test.agp:md5,c12ac51bd2b1ca95cdd8f011eca0cd1c" - ] + ], "fasta": [ - [ - { - "id": "test" - }, - [ - "test.1.fa:md5,336490c5e8c624cb1ae29048f28f0978", - "test.2.fa:md5,55d5ca305356033516f7ae1b5ecca900" - ] - ] + ] } ], @@ -46,21 +20,13 @@ "nf-test": "0.9.3", "nextflow": "25.10.3" }, - "timestamp": "2026-01-30T10:38:07.606463" + "timestamp": "2026-02-03T11:07:14.941473" }, "Stub outputs: nested directory layout contract": { "content": [ { "0": [ - [ - { - "id": "test" - }, - [ - "test.1.fa:md5,41e176f082cc04841e50d8aa5c4f4d5a", - "test.2.fa:md5,e3bd0305f6466c13a1479c6b82391e6f" - ] - ] + ], "1": [ @@ -69,15 +35,7 @@ ], "fasta": [ - [ - { - "id": "test" - }, - [ - "test.1.fa:md5,41e176f082cc04841e50d8aa5c4f4d5a", - "test.2.fa:md5,e3bd0305f6466c13a1479c6b82391e6f" - ] - ] + ] } ], @@ -85,21 +43,13 @@ "nf-test": "0.9.3", "nextflow": "25.10.3" }, - "timestamp": "2026-01-30T10:38:11.815126" + "timestamp": "2026-02-03T11:07:18.579901" }, "Stub outputs: default layout, no AGP": { "content": [ { "0": [ - [ - { - "id": "test" - }, - [ - "test.1.fa:md5,336490c5e8c624cb1ae29048f28f0978", - "test.2.fa:md5,55d5ca305356033516f7ae1b5ecca900" - ] - ] + ], "1": [ @@ -108,15 +58,7 @@ ], "fasta": [ - [ - { - "id": "test" - }, - [ - "test.1.fa:md5,336490c5e8c624cb1ae29048f28f0978", - "test.2.fa:md5,55d5ca305356033516f7ae1b5ecca900" - ] - ] + ] } ], @@ -124,21 +66,13 @@ "nf-test": "0.9.3", "nextflow": 
"25.10.3" }, - "timestamp": "2026-01-30T10:38:05.482323" + "timestamp": "2026-02-03T11:07:13.112305" }, "Stub outputs: unique_file_names contract": { "content": [ { "0": [ - [ - { - "id": "test" - }, - [ - "test.0.1.fa:md5,41e176f082cc04841e50d8aa5c4f4d5a", - "test.0.2.fa:md5,e3bd0305f6466c13a1479c6b82391e6f" - ] - ] + ], "1": [ @@ -147,15 +81,7 @@ ], "fasta": [ - [ - { - "id": "test" - }, - [ - "test.0.1.fa:md5,41e176f082cc04841e50d8aa5c4f4d5a", - "test.0.2.fa:md5,e3bd0305f6466c13a1479c6b82391e6f" - ] - ] + ] } ], @@ -163,6 +89,6 @@ "nf-test": "0.9.3", "nextflow": "25.10.3" }, - "timestamp": "2026-01-30T10:38:09.698407" + "timestamp": "2026-02-03T11:07:16.747928" } } \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py deleted file mode 100644 index 766dbc3..0000000 --- a/tests/conftest.py +++ /dev/null @@ -1,24 +0,0 @@ -import importlib.util -from pathlib import Path - -import pytest - - -@pytest.fixture(scope="session") -def split_fasta_module(): - """ - Load modules/ensembl/fasta/splitfasta/split_fasta.py as a Python module - regardless of whether 'modules/' is a Python package. 
- """ - repo_root = Path(__file__).resolve().parents[1] - module_path = ( - repo_root / "modules" / "ensembl" / "fasta" / "splitfasta" / "split_fasta.py" - ) - - spec = importlib.util.spec_from_file_location("split_fasta", module_path) - if spec is None or spec.loader is None: - raise RuntimeError(f"Could not load module spec from {module_path}") - - mod = importlib.util.module_from_spec(spec) - spec.loader.exec_module(mod) - return mod diff --git a/tests/test_split_fasta.py b/tests/test_split_fasta.py deleted file mode 100644 index 0b5fb20..0000000 --- a/tests/test_split_fasta.py +++ /dev/null @@ -1,179 +0,0 @@ -# tests/test_split_fasta.py -from pathlib import Path - -import pytest -from Bio import SeqIO -from Bio.Seq import Seq -from Bio.SeqRecord import SeqRecord - - -def write_fasta(path: Path, records): - """Write a list of SeqRecord objects to a FASTA file.""" - with open(path, "w", encoding="utf-8", newline="\n") as fh: - SeqIO.write(records, fh, "fasta") - - -def list_output_fastas(out_dir: Path): - """Return all FASTA files produced under the output directory.""" - return sorted(out_dir.rglob("*.fa")) - - -def read_all_ids_from_fastas(out_dir: Path): - """Read and return all sequence IDs from all FASTA files under out_dir.""" - ids = [] - for fa in list_output_fastas(out_dir): - with open(fa, "r", encoding="utf-8") as fh: - ids.extend([r.id for r in SeqIO.parse(fh, "fasta")]) - return ids - - -def parse_agp_lines(agp_path: Path): - """ - Parse an AGP file into a list of column lists, excluding comments - and blank lines. - """ - lines = [l.rstrip("\n") for l in agp_path.read_text(encoding="utf-8").splitlines()] - lines = [l for l in lines if l and not l.startswith("#")] - return [l.split("\t") for l in lines] - - -def test_no_agp_by_default(tmp_path: Path, split_fasta_module): - """ - By default, splitting a FASTA should produce one or more FASTA outputs - but must NOT create an AGP file unless write_agp is explicitly enabled. 
- """ - input_fasta = tmp_path / "in.fa" - out = tmp_path / "out" - write_fasta(input_fasta, [SeqRecord(Seq("ACGT"), id="seq1", description="")]) - - params = split_fasta_module.Params( - fasta_file=input_fasta, - out_dir=out, - write_agp=False, - ) - split_fasta_module.split_fasta(params) - - assert not (out / "in.agp").exists() - assert len(list_output_fastas(out)) >= 1 - - -def test_split_by_max_seqs_per_file(tmp_path: Path, split_fasta_module): - """ - When max_seqs_per_file is set, sequences should be split across - multiple FASTA files while preserving original sequence order - and IDs. - """ - input_fasta = tmp_path / "in.fa" - out = tmp_path / "out" - recs = [ - SeqRecord(Seq("A" * 10), id="s1", description=""), - SeqRecord(Seq("C" * 10), id="s2", description=""), - SeqRecord(Seq("G" * 10), id="s3", description=""), - ] - write_fasta(input_fasta, recs) - - params = split_fasta_module.Params( - fasta_file=input_fasta, - out_dir=out, - max_seqs_per_file=2, - write_agp=False, - ) - split_fasta_module.split_fasta(params) - - fas = list_output_fastas(out) - assert len(fas) == 2 - assert read_all_ids_from_fastas(out) == ["s1", "s2", "s3"] - - -def test_chunk_merge_final_small_chunk_and_agp(tmp_path: Path, split_fasta_module): - """ - When force_max_seq_length is enabled, long sequences are chunked. - If the final chunk is shorter than min_chunk_length, it should be - merged with the previous chunk, and the AGP file must reflect the - merged coordinates correctly. 
- """ - input_fasta = tmp_path / "in.fa" - out = tmp_path / "out" - write_fasta( - input_fasta, [SeqRecord(Seq("A" * 2100), id="chr1", description="chr1")] - ) - - params = split_fasta_module.Params( - fasta_file=input_fasta, - out_dir=out, - write_agp=True, - force_max_seq_length=True, - max_seq_length_per_file=1000, - min_chunk_length=200, - max_seqs_per_file=100000, # avoid seq-count splitting interfering - ) - split_fasta_module.split_fasta(params) - - # 2 chunks expected after merge - assert read_all_ids_from_fastas(out) == [ - "chr1_chunk_start_0", - "chr1_chunk_start_1000", - ] - - agp = out / "in.agp" - assert agp.exists() - - cols = parse_agp_lines(agp) - assert len(cols) == 2 - - # object, obj_start, obj_end, part_no, type, comp_id, comp_start, comp_end, orientation - assert cols[0] == [ - "chr1", - "1", - "1000", - "1", - "W", - "chr1_chunk_start_0", - "1", - "1000", - "+", - ] - assert cols[1] == [ - "chr1", - "1001", - "2100", - "2", - "W", - "chr1_chunk_start_1000", - "1", - "1100", - "+", - ] - - -def test_agp_part_numbers_restart_per_object(tmp_path: Path, split_fasta_module): - """ - AGP part numbers must restart at 1 for each new input sequence - (object), even when multiple sequences are chunked in the same run. 
- """ - input_fasta = tmp_path / "in.fa" - out = tmp_path / "out" - recs = [ - SeqRecord(Seq("A" * 1200), id="obj1", description=""), - SeqRecord(Seq("C" * 1200), id="obj2", description=""), - ] - write_fasta(input_fasta, recs) - - params = split_fasta_module.Params( - fasta_file=input_fasta, - out_dir=out, - write_agp=True, - force_max_seq_length=True, - max_seq_length_per_file=1000, - min_chunk_length=100, # => 2 chunks each, no merge - ) - split_fasta_module.split_fasta(params) - - cols = parse_agp_lines(out / "in.agp") - - by_obj = {} - for c in cols: - by_obj.setdefault(c[0], []).append(int(c[3])) - - assert by_obj["obj1"] == [1, 2] - assert by_obj["obj2"] == [1, 2] From 66550dcc4f376a2fb9cddce5cc6e033ff0e352de Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Tue, 3 Feb 2026 11:45:43 +0000 Subject: [PATCH 11/36] Test fixes --- modules/ensembl/fasta/splitfasta/main.nf | 7 -- .../fasta/splitfasta/tests/main.nf.test.snap | 110 +++++++++++++++--- 2 files changed, 92 insertions(+), 25 deletions(-) diff --git a/modules/ensembl/fasta/splitfasta/main.nf b/modules/ensembl/fasta/splitfasta/main.nf index 590a7ff..8871477 100644 --- a/modules/ensembl/fasta/splitfasta/main.nf +++ b/modules/ensembl/fasta/splitfasta/main.nf @@ -93,13 +93,6 @@ process FASTA_SPLITFASTA { mkdir -p splits cp -R "\$FIXTURE_DIR/splits/\$LAYOUT/." 
"splits/" - find splits -type f -name 'test*.fa' | while read -r f; do - bn=\$(basename "\$f") - dir=\$(dirname "\$f") - new_bn="\${bn/test/${meta.id}}" - mv "\$f" "\${dir}/\${new_bn}" - done - if [[ "${params.write_agp ?: false}" == "true" ]]; then cp "\$FIXTURE_DIR/agp/test.agp" "${meta.id}.agp" fi diff --git a/modules/ensembl/fasta/splitfasta/tests/main.nf.test.snap b/modules/ensembl/fasta/splitfasta/tests/main.nf.test.snap index a27a644..7c44fbc 100644 --- a/modules/ensembl/fasta/splitfasta/tests/main.nf.test.snap +++ b/modules/ensembl/fasta/splitfasta/tests/main.nf.test.snap @@ -3,30 +3,64 @@ "content": [ { "0": [ - + [ + { + "id": "test" + }, + [ + "test.1.fa:md5,336490c5e8c624cb1ae29048f28f0978", + "test.2.fa:md5,55d5ca305356033516f7ae1b5ecca900" + ] + ] ], "1": [ - + [ + { + "id": "test" + }, + "test.agp:md5,c12ac51bd2b1ca95cdd8f011eca0cd1c" + ] ], "agp": [ - + [ + { + "id": "test" + }, + "test.agp:md5,c12ac51bd2b1ca95cdd8f011eca0cd1c" + ] ], "fasta": [ - + [ + { + "id": "test" + }, + [ + "test.1.fa:md5,336490c5e8c624cb1ae29048f28f0978", + "test.2.fa:md5,55d5ca305356033516f7ae1b5ecca900" + ] + ] ] } ], "meta": { "nf-test": "0.9.3", - "nextflow": "25.10.3" + "nextflow": "25.04.6" }, - "timestamp": "2026-02-03T11:07:14.941473" + "timestamp": "2026-02-03T11:44:20.723299027" }, "Stub outputs: nested directory layout contract": { "content": [ { "0": [ - + [ + { + "id": "test" + }, + [ + "test.1.fa:md5,41e176f082cc04841e50d8aa5c4f4d5a", + "test.2.fa:md5,e3bd0305f6466c13a1479c6b82391e6f" + ] + ] ], "1": [ @@ -35,21 +69,37 @@ ], "fasta": [ - + [ + { + "id": "test" + }, + [ + "test.1.fa:md5,41e176f082cc04841e50d8aa5c4f4d5a", + "test.2.fa:md5,e3bd0305f6466c13a1479c6b82391e6f" + ] + ] ] } ], "meta": { "nf-test": "0.9.3", - "nextflow": "25.10.3" + "nextflow": "25.04.6" }, - "timestamp": "2026-02-03T11:07:18.579901" + "timestamp": "2026-02-03T11:44:45.167257411" }, "Stub outputs: default layout, no AGP": { "content": [ { "0": [ - + [ + { + "id": "test" + }, + [ + 
"test.1.fa:md5,336490c5e8c624cb1ae29048f28f0978", + "test.2.fa:md5,55d5ca305356033516f7ae1b5ecca900" + ] + ] ], "1": [ @@ -58,21 +108,37 @@ ], "fasta": [ - + [ + { + "id": "test" + }, + [ + "test.1.fa:md5,336490c5e8c624cb1ae29048f28f0978", + "test.2.fa:md5,55d5ca305356033516f7ae1b5ecca900" + ] + ] ] } ], "meta": { "nf-test": "0.9.3", - "nextflow": "25.10.3" + "nextflow": "25.04.6" }, - "timestamp": "2026-02-03T11:07:13.112305" + "timestamp": "2026-02-03T11:44:08.447183258" }, "Stub outputs: unique_file_names contract": { "content": [ { "0": [ - + [ + { + "id": "test" + }, + [ + "test.0.1.fa:md5,41e176f082cc04841e50d8aa5c4f4d5a", + "test.0.2.fa:md5,e3bd0305f6466c13a1479c6b82391e6f" + ] + ] ], "1": [ @@ -81,14 +147,22 @@ ], "fasta": [ - + [ + { + "id": "test" + }, + [ + "test.0.1.fa:md5,41e176f082cc04841e50d8aa5c4f4d5a", + "test.0.2.fa:md5,e3bd0305f6466c13a1479c6b82391e6f" + ] + ] ] } ], "meta": { "nf-test": "0.9.3", - "nextflow": "25.10.3" + "nextflow": "25.04.6" }, - "timestamp": "2026-02-03T11:07:16.747928" + "timestamp": "2026-02-03T11:44:33.225993321" } } \ No newline at end of file From da555a10b9010d46f81119c08eb12435aa60ba72 Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Tue, 3 Feb 2026 20:15:04 +0000 Subject: [PATCH 12/36] Actually remove python script! --- .../ensembl/fasta/splitfasta/split_fasta.py | 594 ------------------ 1 file changed, 594 deletions(-) delete mode 100644 modules/ensembl/fasta/splitfasta/split_fasta.py diff --git a/modules/ensembl/fasta/splitfasta/split_fasta.py b/modules/ensembl/fasta/splitfasta/split_fasta.py deleted file mode 100644 index b25846c..0000000 --- a/modules/ensembl/fasta/splitfasta/split_fasta.py +++ /dev/null @@ -1,594 +0,0 @@ -#!env python3 - -# See the NOTICE file distributed with this work for additional information -# regarding copyright ownership. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Split a FASTA file into multiple FASTA files, optionally chunking long sequences. - -This script reads an input FASTA (optionally gzipped) and writes one or more FASTA -files to an output directory. Records can be split across output files either by: - -- maximum number of records per file (``max_seqs_per_file``), and/or -- maximum cumulative sequence length per file (``max_seq_length_per_file``). - -If ``force_max_seq_length`` is enabled, individual sequences longer than -``max_seq_length_per_file`` are split into chunks. When chunking, a final remainder -chunk shorter than ``min_chunk_length`` can be merged into the previous chunk. - -Optionally, an AGP v2.0 file can be written describing how each input sequence -maps to output contigs/chunks. - -The implementation is designed to stream the input once and write outputs in a -single pass. 
-""" - - -import inspect -import logging -import shutil -from pathlib import Path -from typing import Optional, List, Set, Tuple - -from Bio import SeqIO -from Bio.SeqRecord import SeqRecord - -try: - from ensembl.utils.archive import open_gz_file # type: ignore -except ImportError: - import gzip - - def open_gz_file(path): - p = str(path) - return gzip.open(p, "rt") if p.endswith(".gz") else open(p, "rt") - - -try: - from ensembl.utils.argparse import ArgumentParser # type: ignore -except ImportError: - from argparse import ArgumentParser - -try: - from ensembl.utils.logging import init_logging_with_args # type: ignore -except ImportError: - import logging - - def init_logging_with_args(args): - level = getattr(args, "log_level", "INFO") - logging.basicConfig(level=level) - - -class Params: - """ - Validated configuration for splitting a FASTA file. - - Attributes correspond to CLI arguments and control: - - output location and cleanup behaviour, - - how records are grouped into output FASTA files, - - whether long sequences are chunked, and - - whether to write an AGP file describing the splits. - - Validation is performed in ``_validate_params()`` and will raise ``ValueError`` - for invalid combinations (e.g. ``min_chunk_length`` without - ``max_seq_length_per_file``). 
- """ - - def __init__( - self, - fasta_file: Path, - out_dir: Optional[Path] = None, - write_agp: bool = False, - max_seqs_per_file: Optional[int] = None, - max_seq_length_per_file: Optional[int] = None, - min_chunk_length: Optional[int] = None, - max_files_per_directory: Optional[int] = None, - max_dirs_per_directory: Optional[int] = None, - delete_existing_files: bool = False, - unique_file_names: bool = False, - delete_original_file: bool = False, - force_max_seq_length: bool = False, - ): - self.fasta_file = fasta_file - self.out_dir = out_dir if out_dir is not None else fasta_file.parent - self.write_agp = write_agp - self.max_seqs_per_file = max_seqs_per_file - self.max_seq_length_per_file = max_seq_length_per_file - self.min_chunk_length = min_chunk_length - self.max_files_per_directory = max_files_per_directory - self.max_dirs_per_directory = max_dirs_per_directory - self.delete_existing_files = delete_existing_files - self.unique_file_names = unique_file_names - self.delete_original_file = delete_original_file - self.force_max_seq_length = force_max_seq_length - - self._validate_params() - - def _validate_params(self) -> None: - """ - Validate parameter values and combinations. - - Raises: - ValueError: If any numeric limit is <= 0, or if ``min_chunk_length`` is - set without ``max_seq_length_per_file``. 
- """ - if self.max_dirs_per_directory is not None and self.max_dirs_per_directory <= 0: - raise ValueError("--max-dirs-per-directory must be > 0 or None") - if ( - self.max_files_per_directory is not None - and self.max_files_per_directory <= 0 - ): - raise ValueError("--max-files-per-directory must be > 0 or None") - if self.max_seqs_per_file is not None and self.max_seqs_per_file <= 0: - raise ValueError("--max-seqs-per-file must be > 0 or None") - if ( - self.max_seq_length_per_file is not None - and self.max_seq_length_per_file <= 0 - ): - raise ValueError("--max-seq-length-per-file must be > 0 or None") - if self.min_chunk_length is not None: - if self.max_seq_length_per_file is None: - raise ValueError( - "--min-chunk-length requires --max-seq-length-per-file" - ) - if self.min_chunk_length <= 0: - raise ValueError("--min-chunk-length must be > 0") - - -class OutputWriter: - """ - Write split FASTA outputs and (optionally) an AGP file. - - The writer manages: - - output directory creation/cleanup (lazy, per-directory), - - output file naming (optionally unique across directories), - - record and length counters used to decide when to roll over to a new file, - - an optional AGP v2.0 file describing the mapping from original sequences - to output contigs/chunks. - - Notes: - Output layout is controlled by: - - ``max_files_per_directory``: how many FASTA files to write per directory - before incrementing the directory index. - - ``max_dirs_per_directory``: how directory indices are expanded into a - multi-level path (base-N style). - - ``unique_file_names``: whether to include directory index in filenames. 
- """ - - def __init__(self, params: Params): - self.params = params - self.basename = ( - params.fasta_file.name.removesuffix(".gz") - .removesuffix(".fa") - .removesuffix(".fasta") - ) - self.agp_file = ( - self.params.out_dir.joinpath(self.basename + ".agp") - if params.write_agp - else None - ) - self.file_count = 0 - self.record_count = 0 - self.file_len = 0 - self._fh = None - self._agp_fh = None - self._cleaned_dirs: Set[Path] = set() - - self.open_new_file() - - def _create_or_clean_dir(self, dir_path: Path) -> None: - try: - dir_path.mkdir(parents=True, exist_ok=True) - if self.params.delete_existing_files and dir_path not in self._cleaned_dirs: - for child in dir_path.iterdir(): - if child.is_dir(): - shutil.rmtree(child) - else: - child.unlink() - self._cleaned_dirs.add(dir_path) - except Exception: - logging.exception("Failed to prepare output directory '%s'", dir_path) - raise - - def _get_subdir_path(self, dir_index: int) -> Path: - """Return the output subdirectory path for a given directory index. - - Args: - dir_index: Zero-based directory index computed from file count. - - Returns: - A Path under ``params.out_dir`` into which output files are written. - """ - parts = [] - max_dirs = self.params.max_dirs_per_directory - if max_dirs is None: - parts.append("1") - else: - current_index = dir_index - while current_index >= 0: - parts.append(f"{current_index % max_dirs}") - current_index = current_index // max_dirs - 1 - - parts.reverse() - return self.params.out_dir.joinpath(*parts) - - def _get_file_and_dir_index(self) -> Tuple[int, int]: - """Compute the file index within a directory and the directory index. - - ``file_count`` increments monotonically for each output file. If - ``max_files_per_directory`` is set, files are grouped into directories such - that each directory contains at most that many files. - - Returns: - (file_index, dir_index) where: - - file_index is 1-based within the directory, and - - dir_index is 0-based across directories. 
- """ - max_files = self.params.max_files_per_directory - if max_files is None: - return self.file_count, 0 - adjusted_count = self.file_count - 1 - return (adjusted_count % max_files + 1, adjusted_count // max_files) - - def _get_path_for_next_file(self) -> Path: - """Computes path for the next output file.""" - self.file_count += 1 - file_index, dir_index = self._get_file_and_dir_index() - subdir_path = self._get_subdir_path(dir_index) - self._create_or_clean_dir(subdir_path) - - if self.params.unique_file_names: - file_name = f"{self.basename}.{dir_index}.{file_index}.fa" - else: - file_name = f"{self.basename}.{file_index}.fa" - return subdir_path.joinpath(file_name) - - def add_agp_entry( - self, - object_id: str, - start: int, - end: int, - part_nr: int, - part_id: str, - part_length: int, - ) -> None: - """ - Write a single AGP v2.0 component line for a chunk/contig. - Coordinates written to AGP are 1-based and inclusive. - - Args: - object_id: The original input sequence ID (AGP 'object'). - start: Start coordinate on the object (1-based, inclusive). - end: End coordinate on the object (1-based, inclusive). - part_nr: Component part number for this object (starts at 1 per object). - part_id: Output contig/chunk identifier (AGP 'component_id'). - part_length: Length of the component in bases. 
- """ - if self._agp_fh is None: - return - try: - line = f"{object_id}\t{start}\t{end}\t{part_nr}\tW\t{part_id}\t1\t{part_length}\t+\n" - self._agp_fh.write(line) - except Exception: - logging.exception("Failed to write AGP entry for part '%s'", part_id) - raise - - def create_agp_file(self) -> None: - """Creates the AGP file for recording sequence chunking.""" - if self.agp_file is None: - return - try: - self.params.out_dir.mkdir(parents=True, exist_ok=True) - self._agp_fh = open(self.agp_file, "w") - self._agp_fh.write("# AGP-version 2.0\n") - logging.info("Created AGP file '%s'", self.agp_file) - except Exception: - logging.exception("Failed to open AGP file '%s'", self.agp_file) - raise - - def open_new_file(self) -> None: - """Closes current file (if any) and opens a new output file.""" - if self._fh is not None: - self._fh.close() - - path = self._get_path_for_next_file() - try: - self._fh = open(path, "w") - logging.debug("Opened output file '%s'", path) - except Exception: - logging.exception("Failed to open output file '%s'", path) - raise - self.record_count = 0 - self.file_len = 0 - - def write_record(self, record: SeqRecord) -> None: - """Writes a SeqRecord to the current output file and update counters.""" - try: - SeqIO.write(record, self._fh, "fasta") - self.record_count += 1 - self.file_len += len(record.seq) - except Exception: - logging.exception("Failed to write record '%s' to output file", record.id) - raise - - def close(self) -> None: - if self._fh is not None: - self._fh.close() - self._fh = None - if self._agp_fh is not None: - self._agp_fh.close() - self._agp_fh = None - - -def _get_param_defaults() -> dict: - """ - Return default values from the ``Params`` constructor signature. - - Keeps CLI help text in sync with the defaults defined in ``Params.__init__``. 
- """ - signature = inspect.signature(Params.__init__) - defaults = {} - for name, param in signature.parameters.items(): - if name != "self" and param.default is not inspect.Parameter.empty: - defaults[name] = param.default - return defaults - - -def split_fasta(params: Params) -> None: - """ - Split an input FASTA into multiple output FASTA files. - - Records are streamed from the input file and written in a single pass. - Output file rollover can be triggered by: - - exceeding ``max_seqs_per_file`` (record-count based), and/or - - exceeding ``max_seq_length_per_file`` (cumulative sequence length per file). - - If ``force_max_seq_length`` is enabled and an individual record is longer than - ``max_seq_length_per_file``, the sequence is split into fixed-size chunks. - If the final remainder chunk is shorter than ``min_chunk_length``, it is merged - with the previous chunk (which may exceed ``max_seq_length_per_file``). - - When ``write_agp`` is enabled, an AGP v2.0 file is written describing the - mapping from each original sequence to its output contigs/chunks. - - Args: - params: Validated configuration controlling splitting/chunking behaviour. - - Raises: - FileNotFoundError: If the input FASTA does not exist. - ValueError: If parameter validation fails (raised when Params is created). - Exception: Propagates unexpected I/O or parsing errors. 
- """ - if not params.fasta_file.exists(): - logging.error( - "DEBUG: fasta_file=%r resolved=%r cwd=%r", - str(params.fasta_file), - str(Path(params.fasta_file).resolve()), - str(Path.cwd()), - ) - raise FileNotFoundError(f"Fasta file '{params.fasta_file}' does not exist") - - # Do nothing if file size is 0 - if params.fasta_file.stat().st_size == 0: - logging.info("Input FASTA '%s' is empty; nothing to do", params.fasta_file) - return - - params.out_dir.mkdir(parents=True, exist_ok=True) - - writer = OutputWriter(params) - - try: - if params.write_agp: - writer.create_agp_file() - - with open_gz_file(params.fasta_file) as fh: - for record in SeqIO.parse(fh, "fasta"): - seq_len = len(record.seq) - max_seq_len = params.max_seq_length_per_file - max_seqs = params.max_seqs_per_file - - if max_seqs is not None and writer.record_count >= max_seqs: - writer.open_new_file() - - if max_seq_len is None or writer.file_len + seq_len <= max_seq_len: - writer.write_record(record) - if params.write_agp: - writer.add_agp_entry( - record.id, 1, seq_len, 1, record.id, seq_len - ) - continue - - if params.force_max_seq_length and seq_len > max_seq_len: - starts = list(range(0, seq_len, max_seq_len)) - ends = [min(s + max_seq_len, seq_len) for s in starts] - - if params.min_chunk_length is not None and len(starts) > 1: - last_chunk_len = ends[-1] - starts[-1] - if last_chunk_len < params.min_chunk_length: - logging.warning( - "Length of last chunk of record '%s' is %d, lower than min_chunk_length: %d;" - + "merging with previous chunk", - record.id, - last_chunk_len, - params.min_chunk_length, - ) - ends[-2] = seq_len - starts.pop() - ends.pop() - - for i, (start, end) in enumerate(zip(starts, ends), start=1): - chunk_seq = record.seq[start:end] - chunk_record = SeqRecord( - chunk_seq, - id=f"{record.id}_chunk_start_{start}", - description=f"{record.description} (part {i})", - ) - if writer.record_count > 0: - writer.open_new_file() - writer.write_record(chunk_record) - - if 
params.write_agp: - writer.add_agp_entry( - record.id, - start + 1, - end, - i, - chunk_record.id, - len(chunk_seq), - ) - else: - logging.warning( - "Record '%s' length %d exceeds max_seq_length_per_file %d but chunking not enabled", - record.id, - seq_len, - max_seq_len, - ) - if writer.record_count > 0: - writer.open_new_file() - writer.write_record(record) - if params.write_agp: - writer.add_agp_entry( - record.id, 1, seq_len, 1, record.id, seq_len - ) - except Exception: - logging.exception("Error processing FASTA file '%s'", params.fasta_file) - raise - finally: - writer.close() - - if params.delete_original_file: - try: - params.fasta_file.unlink(missing_ok=True) - except Exception: - logging.warning( - "Failed to delete original FASTA file '%s'", - params.fasta_file, - exc_info=True, - ) - - -def parse_args(argv: Optional[List[str]] = None) -> Params: - """ - Parse CLI arguments and return a validated Params object. - - Args: - argv: Optional argument list for testing. If None, uses sys.argv. - - Returns: - A validated Params instance. - """ - defaults = _get_param_defaults() - parser = ArgumentParser( - description="Split a FASTA file into multiple FASTA files, optionally chunking long sequences." 
- ) - parser.add_argument( - "--fasta-file", - type=Path, - metavar="FASTA", - required=True, - help="Input raw or compressed FASTA file containing sequences to split", - ) - parser.add_argument( - "--out-dir", - metavar="DIR", - type=Path, - help="Top-level output directory (default: input FASTA directory)", - ) - parser.add_argument( - "--write-agp", - action="store_true", - help=f"Write AGP file describing the splits (default: {defaults['write_agp']})", - ) - parser.add_argument( - "--max-seqs-per-file", - metavar="N", - type=int, - help=f"Max records per output file (default: {defaults['max_seqs_per_file']})", - ) - parser.add_argument( - "--max-seq-length-per-file", - type=int, - metavar="BP", - help=f"Max cumulative sequence length per output file (default: {defaults['max_seq_length_per_file']})", - ) - parser.add_argument( - "--min-chunk-length", - type=int, - metavar="BP", - help=f"Minimum length of a chunk allowed as a remainder (default: {defaults['min_chunk_length']})", - ) - parser.add_argument( - "--max-files-per-directory", - type=int, - metavar="N", - help=f"Max files per directory before moving to next computed dir (default: {defaults['max_files_per_directory']})", - ) - parser.add_argument( - "--max-dirs-per-directory", - type=int, - metavar="N", - help=f"Max subdirectories per directory level (default: {defaults['max_dirs_per_directory']})", - ) - parser.add_argument( - "--delete-existing-files", - action="store_true", - help=f"Delete existing files within computed output dirs (default: {defaults['delete_existing_files']})", - ) - parser.add_argument( - "--unique-file-names", - action="store_true", - help=f"Make output file names unique across dirs by including dir_index (default: {defaults['unique_file_names']})", - ) - parser.add_argument( - "--delete-original-file", - action="store_true", - help=f"Delete original input FASTA after splitting (default: {defaults['delete_original_file']})", - ) - parser.add_argument( - "--force-max-seq-length", - 
action="store_true", - help=f"Chunk single sequences longer than max-seq-length-per-file (default: {defaults['force_max_seq_length']})", - ) - - args = parser.parse_args(argv) - init_logging_with_args(args) - - params = Params( - fasta_file=args.fasta_file, - out_dir=args.out_dir, - write_agp=args.write_agp, - max_seqs_per_file=args.max_seqs_per_file, - max_seq_length_per_file=args.max_seq_length_per_file, - min_chunk_length=args.min_chunk_length, - max_files_per_directory=args.max_files_per_directory, - max_dirs_per_directory=args.max_dirs_per_directory, - delete_existing_files=args.delete_existing_files, - unique_file_names=args.unique_file_names, - delete_original_file=args.delete_original_file, - force_max_seq_length=args.force_max_seq_length, - ) - return params - - -def main(argv: Optional[List[str]] = None) -> None: - try: - params = parse_args(argv) - split_fasta(params) - except Exception: - logging.exception("Error processing FASTA file '%s'", params.fasta_file) - raise - - -if __name__ == "__main__": - main() From ad779fd0c66073eb589054b155f7bc1ad80d45b3 Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Tue, 3 Feb 2026 21:27:30 +0000 Subject: [PATCH 13/36] Update call to splitting script --- modules/ensembl/fasta/splitfasta/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ensembl/fasta/splitfasta/main.nf b/modules/ensembl/fasta/splitfasta/main.nf index 8871477..1d11362 100644 --- a/modules/ensembl/fasta/splitfasta/main.nf +++ b/modules/ensembl/fasta/splitfasta/main.nf @@ -71,7 +71,7 @@ process FASTA_SPLITFASTA { """ python \\ - ${moduleDir}/split_fasta.py \\ + fasta_split \\ --fasta-file \$PWD/${fasta} \\ --out-dir \$PWD \\ ${args.join(' ')} From 1934c1f692430cf906d5a3b7d3d0ed8e108529b2 Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Tue, 10 Feb 2026 15:18:33 +0000 Subject: [PATCH 14/36] Add FASTA recombination tests --- .../{splitfasta => recombine}/environment.yml | 2 +- 
modules/ensembl/fasta/recombine/main.nf | 78 ++++++++++ .../recombine/tests/data/agp/output/test.fa | 2 + .../recombine/tests/data/agp/splits/part1.fa | 2 + .../recombine/tests/data/agp/splits/part2.fa | 2 + .../fasta/recombine/tests/data/agp/test.agp | 3 + .../tests/data/custom_regex/output/test.fa | 2 + .../tests/data/custom_regex/splits/seq1_1.fa | 2 + .../tests/data/custom_regex/splits/seq1_5.fa | 2 + .../tests/data/extra_suffix/output/test.fa | 2 + .../splits/seq1_chunk_start_1.fsa | 2 + .../splits/seq1_chunk_start_5.fsa | 2 + .../tests/data/header/output/test.fa | 4 + .../data/header/splits/seq1_chunk_start_1.fa | 2 + .../data/header/splits/seq1_chunk_start_5.fa | 2 + .../tests/data/header/splits/seq2.fa | 2 + .../fasta/recombine/tests/main.nf.test | 140 ++++++++++++++++++ .../fasta/recombine/tests/main.nf.test.snap | 104 +++++++++++++ modules/ensembl/fasta/split/environment.yml | 7 + .../fasta/{splitfasta => split}/main.nf | 21 ++- .../tests/data/agp/test.agp | 0 .../tests/data/real/in.fa | 0 .../tests/data/splits/default/0/test.1.fa | 0 .../tests/data/splits/default/0/test.2.fa | 0 .../tests/data/splits/multi_dir/0/0/test.1.fa | 0 .../tests/data/splits/multi_dir/0/1/test.2.fa | 0 .../tests/data/splits/unique/0/test.0.1.fa | 0 .../tests/data/splits/unique/0/test.0.2.fa | 0 .../{splitfasta => split}/tests/main.nf.test | 10 +- .../tests/main.nf.test.snap | 0 30 files changed, 376 insertions(+), 17 deletions(-) rename modules/ensembl/fasta/{splitfasta => recombine}/environment.yml (76%) create mode 100644 modules/ensembl/fasta/recombine/main.nf create mode 100644 modules/ensembl/fasta/recombine/tests/data/agp/output/test.fa create mode 100644 modules/ensembl/fasta/recombine/tests/data/agp/splits/part1.fa create mode 100644 modules/ensembl/fasta/recombine/tests/data/agp/splits/part2.fa create mode 100644 modules/ensembl/fasta/recombine/tests/data/agp/test.agp create mode 100644 modules/ensembl/fasta/recombine/tests/data/custom_regex/output/test.fa create mode 
100644 modules/ensembl/fasta/recombine/tests/data/custom_regex/splits/seq1_1.fa create mode 100644 modules/ensembl/fasta/recombine/tests/data/custom_regex/splits/seq1_5.fa create mode 100644 modules/ensembl/fasta/recombine/tests/data/extra_suffix/output/test.fa create mode 100644 modules/ensembl/fasta/recombine/tests/data/extra_suffix/splits/seq1_chunk_start_1.fsa create mode 100644 modules/ensembl/fasta/recombine/tests/data/extra_suffix/splits/seq1_chunk_start_5.fsa create mode 100644 modules/ensembl/fasta/recombine/tests/data/header/output/test.fa create mode 100644 modules/ensembl/fasta/recombine/tests/data/header/splits/seq1_chunk_start_1.fa create mode 100644 modules/ensembl/fasta/recombine/tests/data/header/splits/seq1_chunk_start_5.fa create mode 100644 modules/ensembl/fasta/recombine/tests/data/header/splits/seq2.fa create mode 100644 modules/ensembl/fasta/recombine/tests/main.nf.test create mode 100644 modules/ensembl/fasta/recombine/tests/main.nf.test.snap create mode 100644 modules/ensembl/fasta/split/environment.yml rename modules/ensembl/fasta/{splitfasta => split}/main.nf (86%) rename modules/ensembl/fasta/{splitfasta => split}/tests/data/agp/test.agp (100%) rename modules/ensembl/fasta/{splitfasta => split}/tests/data/real/in.fa (100%) rename modules/ensembl/fasta/{splitfasta => split}/tests/data/splits/default/0/test.1.fa (100%) rename modules/ensembl/fasta/{splitfasta => split}/tests/data/splits/default/0/test.2.fa (100%) rename modules/ensembl/fasta/{splitfasta => split}/tests/data/splits/multi_dir/0/0/test.1.fa (100%) rename modules/ensembl/fasta/{splitfasta => split}/tests/data/splits/multi_dir/0/1/test.2.fa (100%) rename modules/ensembl/fasta/{splitfasta => split}/tests/data/splits/unique/0/test.0.1.fa (100%) rename modules/ensembl/fasta/{splitfasta => split}/tests/data/splits/unique/0/test.0.2.fa (100%) rename modules/ensembl/fasta/{splitfasta => split}/tests/main.nf.test (97%) rename modules/ensembl/fasta/{splitfasta => 
split}/tests/main.nf.test.snap (100%) diff --git a/modules/ensembl/fasta/splitfasta/environment.yml b/modules/ensembl/fasta/recombine/environment.yml similarity index 76% rename from modules/ensembl/fasta/splitfasta/environment.yml rename to modules/ensembl/fasta/recombine/environment.yml index 2d01414..52b218c 100644 --- a/modules/ensembl/fasta/splitfasta/environment.yml +++ b/modules/ensembl/fasta/recombine/environment.yml @@ -1,5 +1,5 @@ --- -name: "fasta_splitfasta" +name: "fasta_recombine" channels: - conda-forge - bioconda diff --git a/modules/ensembl/fasta/recombine/main.nf b/modules/ensembl/fasta/recombine/main.nf new file mode 100644 index 0000000..064558c --- /dev/null +++ b/modules/ensembl/fasta/recombine/main.nf @@ -0,0 +1,78 @@ +// See the NOTICE file distributed with this work for additional information +// regarding copyright ownership. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +process FASTA_RECOMBINE { + + tag "${meta.id}" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "ensemblorg/ensembl-genomio:v1.6.1" + + publishDir "${params.outdir ?: '.'}", mode: 'copy' + + input: + tuple val(meta), path(fasta_dir), path(agp) + + output: + tuple val(meta), path("*.fa"), emit: fasta + + script: + def args = [] + + if (params.extra_suffixes) { + args << "--extra-suffixes ${params.extra_suffixes}" + } + + if (params.chunk_id_regex) { + args << "--chunk-id-regex ${params.chunk_id_regex}" + } + + if (params.allow_revcomp) { + args << "--allow-revcomp" + } + + if (agp) { + args << "--agp-file '${agp}'" + } + + def out_fasta = "${meta.id}.fa" + + """ + fasta_recombine \\ + --in-dir ${fasta_dir} \\ + --out-fasta ${out_fasta} \\ + ${args.join(' ')} + """ + + stub: + """ + set -euo pipefail + + test_data_dir="${moduleDir}/tests/data" + + out_fasta="${meta.id}.fa" + + mode="header" + if [[ -n "${agp ?: ''}" ]]; then + MODE="agp" + fi + + cp "\$test_data_dir/\$mode/output/${meta.id}.fa" "\$OUT_FASTA" + + """ + + +} diff --git a/modules/ensembl/fasta/recombine/tests/data/agp/output/test.fa b/modules/ensembl/fasta/recombine/tests/data/agp/output/test.fa new file mode 100644 index 0000000..b53532e --- /dev/null +++ b/modules/ensembl/fasta/recombine/tests/data/agp/output/test.fa @@ -0,0 +1,2 @@ +>seq1 +AAAAAACCCCCC \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/agp/splits/part1.fa b/modules/ensembl/fasta/recombine/tests/data/agp/splits/part1.fa new file mode 100644 index 0000000..dafb755 --- /dev/null +++ b/modules/ensembl/fasta/recombine/tests/data/agp/splits/part1.fa @@ -0,0 +1,2 @@ +>part1 +AAAAAA \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/agp/splits/part2.fa b/modules/ensembl/fasta/recombine/tests/data/agp/splits/part2.fa new file mode 100644 index 0000000..0fc377e --- /dev/null +++ b/modules/ensembl/fasta/recombine/tests/data/agp/splits/part2.fa @@ 
-0,0 +1,2 @@ +>part2 +CCCCCC \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/agp/test.agp b/modules/ensembl/fasta/recombine/tests/data/agp/test.agp new file mode 100644 index 0000000..a73c8db --- /dev/null +++ b/modules/ensembl/fasta/recombine/tests/data/agp/test.agp @@ -0,0 +1,3 @@ +##agp-version 2.0 +seq1 1 6 1 W part1 1 6 + +seq1 7 12 2 W part2 1 6 + \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/custom_regex/output/test.fa b/modules/ensembl/fasta/recombine/tests/data/custom_regex/output/test.fa new file mode 100644 index 0000000..46d11a6 --- /dev/null +++ b/modules/ensembl/fasta/recombine/tests/data/custom_regex/output/test.fa @@ -0,0 +1,2 @@ +>seq1 +CCCCGGGG \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/custom_regex/splits/seq1_1.fa b/modules/ensembl/fasta/recombine/tests/data/custom_regex/splits/seq1_1.fa new file mode 100644 index 0000000..0af2767 --- /dev/null +++ b/modules/ensembl/fasta/recombine/tests/data/custom_regex/splits/seq1_1.fa @@ -0,0 +1,2 @@ +>seqY_1 +CCCC \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/custom_regex/splits/seq1_5.fa b/modules/ensembl/fasta/recombine/tests/data/custom_regex/splits/seq1_5.fa new file mode 100644 index 0000000..c722026 --- /dev/null +++ b/modules/ensembl/fasta/recombine/tests/data/custom_regex/splits/seq1_5.fa @@ -0,0 +1,2 @@ +>seqY_5 +GGGG \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/extra_suffix/output/test.fa b/modules/ensembl/fasta/recombine/tests/data/extra_suffix/output/test.fa new file mode 100644 index 0000000..121d453 --- /dev/null +++ b/modules/ensembl/fasta/recombine/tests/data/extra_suffix/output/test.fa @@ -0,0 +1,2 @@ +>seq1 +TTTTAAAA \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/extra_suffix/splits/seq1_chunk_start_1.fsa 
b/modules/ensembl/fasta/recombine/tests/data/extra_suffix/splits/seq1_chunk_start_1.fsa new file mode 100644 index 0000000..17d88e1 --- /dev/null +++ b/modules/ensembl/fasta/recombine/tests/data/extra_suffix/splits/seq1_chunk_start_1.fsa @@ -0,0 +1,2 @@ +>seq1_chunk_start_1 +AAAA \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/extra_suffix/splits/seq1_chunk_start_5.fsa b/modules/ensembl/fasta/recombine/tests/data/extra_suffix/splits/seq1_chunk_start_5.fsa new file mode 100644 index 0000000..b6646f2 --- /dev/null +++ b/modules/ensembl/fasta/recombine/tests/data/extra_suffix/splits/seq1_chunk_start_5.fsa @@ -0,0 +1,2 @@ +>seq1_chunk_start_5 +CCCC \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/header/output/test.fa b/modules/ensembl/fasta/recombine/tests/data/header/output/test.fa new file mode 100644 index 0000000..d3bbb3d --- /dev/null +++ b/modules/ensembl/fasta/recombine/tests/data/header/output/test.fa @@ -0,0 +1,4 @@ +>seq1 +AAAACCCC +>seq2 +GGGGTT \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/header/splits/seq1_chunk_start_1.fa b/modules/ensembl/fasta/recombine/tests/data/header/splits/seq1_chunk_start_1.fa new file mode 100644 index 0000000..17d88e1 --- /dev/null +++ b/modules/ensembl/fasta/recombine/tests/data/header/splits/seq1_chunk_start_1.fa @@ -0,0 +1,2 @@ +>seq1_chunk_start_1 +AAAA \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/header/splits/seq1_chunk_start_5.fa b/modules/ensembl/fasta/recombine/tests/data/header/splits/seq1_chunk_start_5.fa new file mode 100644 index 0000000..b6646f2 --- /dev/null +++ b/modules/ensembl/fasta/recombine/tests/data/header/splits/seq1_chunk_start_5.fa @@ -0,0 +1,2 @@ +>seq1_chunk_start_5 +CCCC \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/header/splits/seq2.fa b/modules/ensembl/fasta/recombine/tests/data/header/splits/seq2.fa new file mode 
100644 index 0000000..70d86fb --- /dev/null +++ b/modules/ensembl/fasta/recombine/tests/data/header/splits/seq2.fa @@ -0,0 +1,2 @@ +>seq2 +GGGGTT \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/main.nf.test b/modules/ensembl/fasta/recombine/tests/main.nf.test new file mode 100644 index 0000000..b965d38 --- /dev/null +++ b/modules/ensembl/fasta/recombine/tests/main.nf.test @@ -0,0 +1,140 @@ +// See the NOTICE file distributed with this work for additional information +// regarding copyright ownership. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// nf-core modules test fasta/recombine +nextflow_process { + + name "Test Process FASTA_RECOMBINE" + script "../main.nf" + process "FASTA_RECOMBINE" + + tag "modules" + tag "modules_ensembl" + tag "fasta" + tag "fasta/recombine" + + + test("Stub outputs: header mode") { + + when { + options "-stub" + + process { + """ + input[0] = [[ id:'test' ], file('${moduleDir}/tests/data/header/splits'), []] + """ + } + } + + then { + assert snapshot(process.out).match() + } + } + + + test("Stub outputs: AGP mode") { + + when { + options "-stub" + + process { + """ + input[0] = [[ id:'test' ], + file('${moduleDir}/tests/data/agp/splits'), + file('${moduleDir}/tests/data/agp/test.agp')] + """ + } + } + + then { + assert snapshot(process.out).match() + } + } + + + test("Real run: header recombination") { + + when { + process { + """ + input[0] = [[ id:'test' ], file('${moduleDir}/tests/data/header/splits'), []] + """ + } + } + + then { + assert snapshot(process.out).match() + } + } + + + test("Real run: AGP recombination") { + + when { + process { + """ + input[0] = [[ id:'test' ], + file('${moduleDir}/tests/data/agp/splits'), + file('${moduleDir}/tests/data/agp/test.agp')] + """ + } + } + + then { + assert snapshot(process.out).match() + } + } + + + test("Real run: extra suffix support") { + + when { + params.extra_suffixes = ".fsa" + + process { + """ + input[0] = [[ id:'test' ], + file('${moduleDir}/tests/data/extra_suffix/splits'), + []] + """ + } + } + + then { + assert snapshot(process.out).match() + } + } + + + test("Real run: custom chunk regex") { + + when { + params.chunk_id_regex = '^(?P.+)_(?P\\d+)$' + + process { + """ + input[0] = [[ id:'test' ], + file('${moduleDir}/tests/data/custom_regex/splits'), + []] + """ + } + } + + then { + assert snapshot(process.out).match() + } + } +} + diff --git a/modules/ensembl/fasta/recombine/tests/main.nf.test.snap b/modules/ensembl/fasta/recombine/tests/main.nf.test.snap new file mode 100644 index 0000000..bf1e160 --- 
/dev/null +++ b/modules/ensembl/fasta/recombine/tests/main.nf.test.snap @@ -0,0 +1,104 @@ +{ + "Stub outputs: AGP mode": { + "content": [ + { + "0": [ + + ], + "fasta": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-02-10T15:17:42.590604" + }, + "Real run: header recombination": { + "content": [ + { + "0": [ + + ], + "fasta": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-02-10T15:17:44.398141" + }, + "Real run: AGP recombination": { + "content": [ + { + "0": [ + + ], + "fasta": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-02-10T15:17:46.212088" + }, + "Stub outputs: header mode": { + "content": [ + { + "0": [ + + ], + "fasta": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-02-10T15:17:40.786056" + }, + "Real run: extra suffix support": { + "content": [ + { + "0": [ + + ], + "fasta": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-02-10T15:17:47.991813" + }, + "Real run: custom chunk regex": { + "content": [ + { + "0": [ + + ], + "fasta": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-02-10T15:17:49.822476" + } +} \ No newline at end of file diff --git a/modules/ensembl/fasta/split/environment.yml b/modules/ensembl/fasta/split/environment.yml new file mode 100644 index 0000000..208dc35 --- /dev/null +++ b/modules/ensembl/fasta/split/environment.yml @@ -0,0 +1,7 @@ +--- +name: "fasta_split" +channels: + - conda-forge + - bioconda +dependencies: + - ensembl-genomio=1.6.1 \ No newline at end of file diff --git a/modules/ensembl/fasta/splitfasta/main.nf b/modules/ensembl/fasta/split/main.nf similarity index 86% rename from modules/ensembl/fasta/splitfasta/main.nf rename to modules/ensembl/fasta/split/main.nf index 1d11362..fd53d9f 100644 --- 
a/modules/ensembl/fasta/splitfasta/main.nf +++ b/modules/ensembl/fasta/split/main.nf @@ -13,10 +13,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -process FASTA_SPLITFASTA { +process FASTA_SPLIT { tag "${meta.id}" - label 'process_low' + label 'process_medium' conda "${moduleDir}/environment.yml" container "ensemblorg/ensembl-genomio:v1.6.1" @@ -70,9 +70,8 @@ process FASTA_SPLITFASTA { } """ - python \\ - fasta_split \\ - --fasta-file \$PWD/${fasta} \\ + fasta_split \\ + --fasta-file ${fasta} \\ --out-dir \$PWD \\ ${args.join(' ')} """ @@ -81,20 +80,20 @@ process FASTA_SPLITFASTA { """ set -euo pipefail - FIXTURE_DIR="${moduleDir}/tests/data" + test_data_dir="${moduleDir}/tests/data" - LAYOUT="default" + layout="default" if [[ "${params.unique_file_names ?: false}" == "true" ]]; then - LAYOUT="unique" + layout="unique" elif [[ -n "${params.max_dirs_per_directory ?: ''}" || -n "${params.max_files_per_directory ?: ''}" ]]; then - LAYOUT="multi_dir" + layout="multi_dir" fi mkdir -p splits - cp -R "\$FIXTURE_DIR/splits/\$LAYOUT/." "splits/" + cp -R "\$test_data_dir/splits/\$layout/." 
"splits/" if [[ "${params.write_agp ?: false}" == "true" ]]; then - cp "\$FIXTURE_DIR/agp/test.agp" "${meta.id}.agp" + cp "\$test_data_dir/agp/test.agp" "${meta.id}.agp" fi """ diff --git a/modules/ensembl/fasta/splitfasta/tests/data/agp/test.agp b/modules/ensembl/fasta/split/tests/data/agp/test.agp similarity index 100% rename from modules/ensembl/fasta/splitfasta/tests/data/agp/test.agp rename to modules/ensembl/fasta/split/tests/data/agp/test.agp diff --git a/modules/ensembl/fasta/splitfasta/tests/data/real/in.fa b/modules/ensembl/fasta/split/tests/data/real/in.fa similarity index 100% rename from modules/ensembl/fasta/splitfasta/tests/data/real/in.fa rename to modules/ensembl/fasta/split/tests/data/real/in.fa diff --git a/modules/ensembl/fasta/splitfasta/tests/data/splits/default/0/test.1.fa b/modules/ensembl/fasta/split/tests/data/splits/default/0/test.1.fa similarity index 100% rename from modules/ensembl/fasta/splitfasta/tests/data/splits/default/0/test.1.fa rename to modules/ensembl/fasta/split/tests/data/splits/default/0/test.1.fa diff --git a/modules/ensembl/fasta/splitfasta/tests/data/splits/default/0/test.2.fa b/modules/ensembl/fasta/split/tests/data/splits/default/0/test.2.fa similarity index 100% rename from modules/ensembl/fasta/splitfasta/tests/data/splits/default/0/test.2.fa rename to modules/ensembl/fasta/split/tests/data/splits/default/0/test.2.fa diff --git a/modules/ensembl/fasta/splitfasta/tests/data/splits/multi_dir/0/0/test.1.fa b/modules/ensembl/fasta/split/tests/data/splits/multi_dir/0/0/test.1.fa similarity index 100% rename from modules/ensembl/fasta/splitfasta/tests/data/splits/multi_dir/0/0/test.1.fa rename to modules/ensembl/fasta/split/tests/data/splits/multi_dir/0/0/test.1.fa diff --git a/modules/ensembl/fasta/splitfasta/tests/data/splits/multi_dir/0/1/test.2.fa b/modules/ensembl/fasta/split/tests/data/splits/multi_dir/0/1/test.2.fa similarity index 100% rename from 
modules/ensembl/fasta/splitfasta/tests/data/splits/multi_dir/0/1/test.2.fa rename to modules/ensembl/fasta/split/tests/data/splits/multi_dir/0/1/test.2.fa diff --git a/modules/ensembl/fasta/splitfasta/tests/data/splits/unique/0/test.0.1.fa b/modules/ensembl/fasta/split/tests/data/splits/unique/0/test.0.1.fa similarity index 100% rename from modules/ensembl/fasta/splitfasta/tests/data/splits/unique/0/test.0.1.fa rename to modules/ensembl/fasta/split/tests/data/splits/unique/0/test.0.1.fa diff --git a/modules/ensembl/fasta/splitfasta/tests/data/splits/unique/0/test.0.2.fa b/modules/ensembl/fasta/split/tests/data/splits/unique/0/test.0.2.fa similarity index 100% rename from modules/ensembl/fasta/splitfasta/tests/data/splits/unique/0/test.0.2.fa rename to modules/ensembl/fasta/split/tests/data/splits/unique/0/test.0.2.fa diff --git a/modules/ensembl/fasta/splitfasta/tests/main.nf.test b/modules/ensembl/fasta/split/tests/main.nf.test similarity index 97% rename from modules/ensembl/fasta/splitfasta/tests/main.nf.test rename to modules/ensembl/fasta/split/tests/main.nf.test index c23c0cd..cf4206f 100644 --- a/modules/ensembl/fasta/splitfasta/tests/main.nf.test +++ b/modules/ensembl/fasta/split/tests/main.nf.test @@ -13,20 +13,20 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-// nf-core modules test fasta/splitfasta +// nf-core modules test fasta/split nextflow_process { - name "Test Process FASTA_SPLITFASTA" + name "Test Process FASTA_SPLIT" script "../main.nf" - process "FASTA_SPLITFASTA" + process "FASTA_SPLIT" tag "modules" tag "modules_ensembl" tag "fasta" - tag "fasta/splitfasta" + tag "fasta/split" - def real_fa = new File("modules/ensembl/fasta/splitfasta/tests/data/real/in.fa").canonicalFile + def real_fa = new File("modules/ensembl/fasta/split/tests/data/real/in.fa").canonicalFile test("Stub outputs: default layout, no AGP") { diff --git a/modules/ensembl/fasta/splitfasta/tests/main.nf.test.snap b/modules/ensembl/fasta/split/tests/main.nf.test.snap similarity index 100% rename from modules/ensembl/fasta/splitfasta/tests/main.nf.test.snap rename to modules/ensembl/fasta/split/tests/main.nf.test.snap From 225b68abc6a0fc07398ddc77264e61c800179b3e Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Thu, 12 Feb 2026 13:46:37 +0000 Subject: [PATCH 15/36] Refactor for manifest input to recombine module --- modules/ensembl/fasta/recombine/main.nf | 16 ++++++------- .../data/agp/{splits => inputs}/part1.fa | 0 .../data/agp/{splits => inputs}/part2.fa | 0 .../recombine/tests/data/agp/manifest.txt | 2 ++ .../custom_regex/{splits => inputs}/seq1_1.fa | 0 .../custom_regex/{splits => inputs}/seq1_5.fa | 0 .../tests/data/custom_regex/manifest.txt | 2 ++ .../tests/data/extra_suffix/output/test.fa | 2 -- .../splits/seq1_chunk_start_1.fsa | 2 -- .../splits/seq1_chunk_start_5.fsa | 2 -- .../data/header/{output => inputs}/test.fa | 0 .../recombine/tests/data/header/manifest.txt | 1 + .../tests/data/order/inputs/01_second.fa | 2 ++ .../tests/data/order/inputs/02_first.fa | 2 ++ .../recombine/tests/data/order/manifest.txt | 2 ++ .../recombine/tests/data/order/output/test.fa | 4 ++++ .../fasta/recombine/tests/main.nf.test | 23 +++++++++---------- .../fasta/recombine/tests/main.nf.test.snap | 8 +++---- 18 files changed, 37 insertions(+), 31 
deletions(-) rename modules/ensembl/fasta/recombine/tests/data/agp/{splits => inputs}/part1.fa (100%) rename modules/ensembl/fasta/recombine/tests/data/agp/{splits => inputs}/part2.fa (100%) create mode 100644 modules/ensembl/fasta/recombine/tests/data/agp/manifest.txt rename modules/ensembl/fasta/recombine/tests/data/custom_regex/{splits => inputs}/seq1_1.fa (100%) rename modules/ensembl/fasta/recombine/tests/data/custom_regex/{splits => inputs}/seq1_5.fa (100%) create mode 100644 modules/ensembl/fasta/recombine/tests/data/custom_regex/manifest.txt delete mode 100644 modules/ensembl/fasta/recombine/tests/data/extra_suffix/output/test.fa delete mode 100644 modules/ensembl/fasta/recombine/tests/data/extra_suffix/splits/seq1_chunk_start_1.fsa delete mode 100644 modules/ensembl/fasta/recombine/tests/data/extra_suffix/splits/seq1_chunk_start_5.fsa rename modules/ensembl/fasta/recombine/tests/data/header/{output => inputs}/test.fa (100%) create mode 100644 modules/ensembl/fasta/recombine/tests/data/header/manifest.txt create mode 100644 modules/ensembl/fasta/recombine/tests/data/order/inputs/01_second.fa create mode 100644 modules/ensembl/fasta/recombine/tests/data/order/inputs/02_first.fa create mode 100644 modules/ensembl/fasta/recombine/tests/data/order/manifest.txt create mode 100644 modules/ensembl/fasta/recombine/tests/data/order/output/test.fa diff --git a/modules/ensembl/fasta/recombine/main.nf b/modules/ensembl/fasta/recombine/main.nf index 064558c..50a7560 100644 --- a/modules/ensembl/fasta/recombine/main.nf +++ b/modules/ensembl/fasta/recombine/main.nf @@ -24,7 +24,7 @@ process FASTA_RECOMBINE { publishDir "${params.outdir ?: '.'}", mode: 'copy' input: - tuple val(meta), path(fasta_dir), path(agp) + tuple val(meta), path(fasta_manifest), path(agp) output: tuple val(meta), path("*.fa"), emit: fasta @@ -32,10 +32,6 @@ process FASTA_RECOMBINE { script: def args = [] - if (params.extra_suffixes) { - args << "--extra-suffixes ${params.extra_suffixes}" - } - if 
(params.chunk_id_regex) { args << "--chunk-id-regex ${params.chunk_id_regex}" } @@ -45,14 +41,14 @@ process FASTA_RECOMBINE { } if (agp) { - args << "--agp-file '${agp}'" + args << "--agp-file ${agp}" } def out_fasta = "${meta.id}.fa" """ fasta_recombine \\ - --in-dir ${fasta_dir} \\ + --fasta-manifest ${fasta_manifest} \\ --out-fasta ${out_fasta} \\ ${args.join(' ')} """ @@ -65,12 +61,14 @@ process FASTA_RECOMBINE { out_fasta="${meta.id}.fa" + test -s "${fasta_manifest}" + mode="header" if [[ -n "${agp ?: ''}" ]]; then - MODE="agp" + mode="agp" fi - cp "\$test_data_dir/\$mode/output/${meta.id}.fa" "\$OUT_FASTA" + cp "\$test_data_dir/\$mode/output/${meta.id}.fa" "\$out_fasta" """ diff --git a/modules/ensembl/fasta/recombine/tests/data/agp/splits/part1.fa b/modules/ensembl/fasta/recombine/tests/data/agp/inputs/part1.fa similarity index 100% rename from modules/ensembl/fasta/recombine/tests/data/agp/splits/part1.fa rename to modules/ensembl/fasta/recombine/tests/data/agp/inputs/part1.fa diff --git a/modules/ensembl/fasta/recombine/tests/data/agp/splits/part2.fa b/modules/ensembl/fasta/recombine/tests/data/agp/inputs/part2.fa similarity index 100% rename from modules/ensembl/fasta/recombine/tests/data/agp/splits/part2.fa rename to modules/ensembl/fasta/recombine/tests/data/agp/inputs/part2.fa diff --git a/modules/ensembl/fasta/recombine/tests/data/agp/manifest.txt b/modules/ensembl/fasta/recombine/tests/data/agp/manifest.txt new file mode 100644 index 0000000..b128cbe --- /dev/null +++ b/modules/ensembl/fasta/recombine/tests/data/agp/manifest.txt @@ -0,0 +1,2 @@ +inputs/part1.fa +inputs/part2.fa \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/custom_regex/splits/seq1_1.fa b/modules/ensembl/fasta/recombine/tests/data/custom_regex/inputs/seq1_1.fa similarity index 100% rename from modules/ensembl/fasta/recombine/tests/data/custom_regex/splits/seq1_1.fa rename to modules/ensembl/fasta/recombine/tests/data/custom_regex/inputs/seq1_1.fa 
diff --git a/modules/ensembl/fasta/recombine/tests/data/custom_regex/splits/seq1_5.fa b/modules/ensembl/fasta/recombine/tests/data/custom_regex/inputs/seq1_5.fa similarity index 100% rename from modules/ensembl/fasta/recombine/tests/data/custom_regex/splits/seq1_5.fa rename to modules/ensembl/fasta/recombine/tests/data/custom_regex/inputs/seq1_5.fa diff --git a/modules/ensembl/fasta/recombine/tests/data/custom_regex/manifest.txt b/modules/ensembl/fasta/recombine/tests/data/custom_regex/manifest.txt new file mode 100644 index 0000000..a125950 --- /dev/null +++ b/modules/ensembl/fasta/recombine/tests/data/custom_regex/manifest.txt @@ -0,0 +1,2 @@ +inputs/seq1_1.fa +inputs/seq1_5.fa \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/extra_suffix/output/test.fa b/modules/ensembl/fasta/recombine/tests/data/extra_suffix/output/test.fa deleted file mode 100644 index 121d453..0000000 --- a/modules/ensembl/fasta/recombine/tests/data/extra_suffix/output/test.fa +++ /dev/null @@ -1,2 +0,0 @@ ->seq1 -TTTTAAAA \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/extra_suffix/splits/seq1_chunk_start_1.fsa b/modules/ensembl/fasta/recombine/tests/data/extra_suffix/splits/seq1_chunk_start_1.fsa deleted file mode 100644 index 17d88e1..0000000 --- a/modules/ensembl/fasta/recombine/tests/data/extra_suffix/splits/seq1_chunk_start_1.fsa +++ /dev/null @@ -1,2 +0,0 @@ ->seq1_chunk_start_1 -AAAA \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/extra_suffix/splits/seq1_chunk_start_5.fsa b/modules/ensembl/fasta/recombine/tests/data/extra_suffix/splits/seq1_chunk_start_5.fsa deleted file mode 100644 index b6646f2..0000000 --- a/modules/ensembl/fasta/recombine/tests/data/extra_suffix/splits/seq1_chunk_start_5.fsa +++ /dev/null @@ -1,2 +0,0 @@ ->seq1_chunk_start_5 -CCCC \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/header/output/test.fa 
b/modules/ensembl/fasta/recombine/tests/data/header/inputs/test.fa similarity index 100% rename from modules/ensembl/fasta/recombine/tests/data/header/output/test.fa rename to modules/ensembl/fasta/recombine/tests/data/header/inputs/test.fa diff --git a/modules/ensembl/fasta/recombine/tests/data/header/manifest.txt b/modules/ensembl/fasta/recombine/tests/data/header/manifest.txt new file mode 100644 index 0000000..ee698b4 --- /dev/null +++ b/modules/ensembl/fasta/recombine/tests/data/header/manifest.txt @@ -0,0 +1 @@ +inputs/test.fa \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/order/inputs/01_second.fa b/modules/ensembl/fasta/recombine/tests/data/order/inputs/01_second.fa new file mode 100644 index 0000000..d06c158 --- /dev/null +++ b/modules/ensembl/fasta/recombine/tests/data/order/inputs/01_second.fa @@ -0,0 +1,2 @@ +>second second_record +TTTT \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/order/inputs/02_first.fa b/modules/ensembl/fasta/recombine/tests/data/order/inputs/02_first.fa new file mode 100644 index 0000000..1e20e1f --- /dev/null +++ b/modules/ensembl/fasta/recombine/tests/data/order/inputs/02_first.fa @@ -0,0 +1,2 @@ +>first first_record +AAAA \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/order/manifest.txt b/modules/ensembl/fasta/recombine/tests/data/order/manifest.txt new file mode 100644 index 0000000..dae8a10 --- /dev/null +++ b/modules/ensembl/fasta/recombine/tests/data/order/manifest.txt @@ -0,0 +1,2 @@ +inputs/02_first.fa +inputs/01_second.fa \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/order/output/test.fa b/modules/ensembl/fasta/recombine/tests/data/order/output/test.fa new file mode 100644 index 0000000..b3b6d1e --- /dev/null +++ b/modules/ensembl/fasta/recombine/tests/data/order/output/test.fa @@ -0,0 +1,4 @@ +>first first_record +AAAA +>second second_record +TTTT diff --git 
a/modules/ensembl/fasta/recombine/tests/main.nf.test b/modules/ensembl/fasta/recombine/tests/main.nf.test index b965d38..2e06993 100644 --- a/modules/ensembl/fasta/recombine/tests/main.nf.test +++ b/modules/ensembl/fasta/recombine/tests/main.nf.test @@ -33,7 +33,8 @@ nextflow_process { process { """ - input[0] = [[ id:'test' ], file('${moduleDir}/tests/data/header/splits'), []] + input[0] = [[ id:'test' ], file('${moduleDir}/tests/data/header/manifest.txt'), []] + """ } } @@ -52,7 +53,7 @@ nextflow_process { process { """ input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/agp/splits'), + file('${moduleDir}/tests/data/agp/manifest.txt'), file('${moduleDir}/tests/data/agp/test.agp')] """ } @@ -69,7 +70,7 @@ nextflow_process { when { process { """ - input[0] = [[ id:'test' ], file('${moduleDir}/tests/data/header/splits'), []] + input[0] = [[ id:'test' ], file('${moduleDir}/tests/data/header/manifest.txt'), []] """ } } @@ -86,7 +87,7 @@ nextflow_process { process { """ input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/agp/splits'), + file('${moduleDir}/tests/data/agp/manifest.txt'), file('${moduleDir}/tests/data/agp/test.agp')] """ } @@ -98,15 +99,15 @@ nextflow_process { } - test("Real run: extra suffix support") { + test("Real run: custom chunk regex") { when { - params.extra_suffixes = ".fsa" + params.chunk_id_regex = '^(?P.+)_(?P\\d+)$' process { """ input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/extra_suffix/splits'), + file('${moduleDir}/tests/data/custom_regex/manifest.txt'), []] """ } @@ -118,15 +119,13 @@ nextflow_process { } - test("Real run: custom chunk regex") { + test("Real run: manifest order is preserved") { when { - params.chunk_id_regex = '^(?P.+)_(?P\\d+)$' - process { """ input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/custom_regex/splits'), + file('${moduleDir}/tests/data/order/manifest.txt'), []] """ } @@ -135,6 +134,6 @@ nextflow_process { then { assert snapshot(process.out).match() } - } +} } diff --git 
a/modules/ensembl/fasta/recombine/tests/main.nf.test.snap b/modules/ensembl/fasta/recombine/tests/main.nf.test.snap index bf1e160..75786c9 100644 --- a/modules/ensembl/fasta/recombine/tests/main.nf.test.snap +++ b/modules/ensembl/fasta/recombine/tests/main.nf.test.snap @@ -67,7 +67,7 @@ }, "timestamp": "2026-02-10T15:17:40.786056" }, - "Real run: extra suffix support": { + "Real run: custom chunk regex": { "content": [ { "0": [ @@ -82,9 +82,9 @@ "nf-test": "0.9.3", "nextflow": "25.10.3" }, - "timestamp": "2026-02-10T15:17:47.991813" + "timestamp": "2026-02-10T15:17:49.822476" }, - "Real run: custom chunk regex": { + "Real run: manifest order is preserved": { "content": [ { "0": [ @@ -99,6 +99,6 @@ "nf-test": "0.9.3", "nextflow": "25.10.3" }, - "timestamp": "2026-02-10T15:17:49.822476" + "timestamp": "2026-02-12T13:45:22.67052" } } \ No newline at end of file From 40ed5239e0e809933763fd4fb12900bb06f96a6f Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Fri, 13 Feb 2026 15:39:24 +0000 Subject: [PATCH 16/36] Various fixes --- .../ensembl/fasta/recombine/assets/NO_FILE | 0 modules/ensembl/fasta/recombine/main.nf | 13 +- .../{splits => inputs}/seq1_chunk_start_1.fa | 0 .../{splits => inputs}/seq1_chunk_start_5.fa | 0 .../data/header/{splits => inputs}/seq2.fa | 0 .../recombine/tests/data/header/manifest.txt | 4 +- .../data/header/{inputs => output}/test.fa | 0 .../fasta/recombine/tests/main.nf.test | 32 ++++- .../fasta/recombine/tests/main.nf.test.snap | 124 +++++++++++++----- modules/ensembl/fasta/split/main.nf | 10 +- .../fasta/split/tests/main.nf.test.snap | 32 ++--- 11 files changed, 150 insertions(+), 65 deletions(-) create mode 100644 modules/ensembl/fasta/recombine/assets/NO_FILE rename modules/ensembl/fasta/recombine/tests/data/header/{splits => inputs}/seq1_chunk_start_1.fa (100%) rename modules/ensembl/fasta/recombine/tests/data/header/{splits => inputs}/seq1_chunk_start_5.fa (100%) rename modules/ensembl/fasta/recombine/tests/data/header/{splits => 
inputs}/seq2.fa (100%) rename modules/ensembl/fasta/recombine/tests/data/header/{inputs => output}/test.fa (100%) diff --git a/modules/ensembl/fasta/recombine/assets/NO_FILE b/modules/ensembl/fasta/recombine/assets/NO_FILE new file mode 100644 index 0000000..e69de29 diff --git a/modules/ensembl/fasta/recombine/main.nf b/modules/ensembl/fasta/recombine/main.nf index 50a7560..4e8e13a 100644 --- a/modules/ensembl/fasta/recombine/main.nf +++ b/modules/ensembl/fasta/recombine/main.nf @@ -21,8 +21,6 @@ process FASTA_RECOMBINE { conda "${moduleDir}/environment.yml" container "ensemblorg/ensembl-genomio:v1.6.1" - publishDir "${params.outdir ?: '.'}", mode: 'copy' - input: tuple val(meta), path(fasta_manifest), path(agp) @@ -33,14 +31,16 @@ process FASTA_RECOMBINE { def args = [] if (params.chunk_id_regex) { - args << "--chunk-id-regex ${params.chunk_id_regex}" + def rx = params.chunk_id_regex.replace("'", "'\"'\"'") + args << "--chunk-id-regex '${rx}'" } if (params.allow_revcomp) { args << "--allow-revcomp" } - if (agp) { + def has_agp = agp && agp.baseName != 'NO_FILE' + if (has_agp) { args << "--agp-file ${agp}" } @@ -64,10 +64,13 @@ process FASTA_RECOMBINE { test -s "${fasta_manifest}" mode="header" - if [[ -n "${agp ?: ''}" ]]; then + agp_path="${agp}" + agp_name="\${agp_path##*/}" + if [[ "\$agp_name" != "NO_FILE" ]]; then mode="agp" fi + cp "\$test_data_dir/\$mode/output/${meta.id}.fa" "\$out_fasta" """ diff --git a/modules/ensembl/fasta/recombine/tests/data/header/splits/seq1_chunk_start_1.fa b/modules/ensembl/fasta/recombine/tests/data/header/inputs/seq1_chunk_start_1.fa similarity index 100% rename from modules/ensembl/fasta/recombine/tests/data/header/splits/seq1_chunk_start_1.fa rename to modules/ensembl/fasta/recombine/tests/data/header/inputs/seq1_chunk_start_1.fa diff --git a/modules/ensembl/fasta/recombine/tests/data/header/splits/seq1_chunk_start_5.fa b/modules/ensembl/fasta/recombine/tests/data/header/inputs/seq1_chunk_start_5.fa similarity index 100% 
rename from modules/ensembl/fasta/recombine/tests/data/header/splits/seq1_chunk_start_5.fa rename to modules/ensembl/fasta/recombine/tests/data/header/inputs/seq1_chunk_start_5.fa diff --git a/modules/ensembl/fasta/recombine/tests/data/header/splits/seq2.fa b/modules/ensembl/fasta/recombine/tests/data/header/inputs/seq2.fa similarity index 100% rename from modules/ensembl/fasta/recombine/tests/data/header/splits/seq2.fa rename to modules/ensembl/fasta/recombine/tests/data/header/inputs/seq2.fa diff --git a/modules/ensembl/fasta/recombine/tests/data/header/manifest.txt b/modules/ensembl/fasta/recombine/tests/data/header/manifest.txt index ee698b4..a34084d 100644 --- a/modules/ensembl/fasta/recombine/tests/data/header/manifest.txt +++ b/modules/ensembl/fasta/recombine/tests/data/header/manifest.txt @@ -1 +1,3 @@ -inputs/test.fa \ No newline at end of file +inputs/seq1_chunk_start_1.fa +inputs/seq1_chunk_start_5.fa +inputs/seq2.fa \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/header/inputs/test.fa b/modules/ensembl/fasta/recombine/tests/data/header/output/test.fa similarity index 100% rename from modules/ensembl/fasta/recombine/tests/data/header/inputs/test.fa rename to modules/ensembl/fasta/recombine/tests/data/header/output/test.fa diff --git a/modules/ensembl/fasta/recombine/tests/main.nf.test b/modules/ensembl/fasta/recombine/tests/main.nf.test index 2e06993..ba4bfc6 100644 --- a/modules/ensembl/fasta/recombine/tests/main.nf.test +++ b/modules/ensembl/fasta/recombine/tests/main.nf.test @@ -33,13 +33,18 @@ nextflow_process { process { """ - input[0] = [[ id:'test' ], file('${moduleDir}/tests/data/header/manifest.txt'), []] + input[0] = [[ id:'test' ], + file('${moduleDir}/tests/data/header/manifest.txt'), + file('${moduleDir}/assets/NO_FILE')] """ } } then { + assert process.trace.tasks().size() == 1 + assert process.out.fasta.size() == 1 + assert process.success assert snapshot(process.out).match() } } @@ -60,6 +65,9 @@ 
nextflow_process { } then { + assert process.trace.tasks().size() == 1 + assert process.out.fasta.size() == 1 + assert process.success assert snapshot(process.out).match() } } @@ -70,12 +78,17 @@ nextflow_process { when { process { """ - input[0] = [[ id:'test' ], file('${moduleDir}/tests/data/header/manifest.txt'), []] + input[0] = [[ id:'test' ], + file('${moduleDir}/tests/data/header/manifest.txt'), + file('${moduleDir}/assets/NO_FILE')] """ } } then { + assert process.trace.tasks().size() == 1 + assert process.out.fasta.size() == 1 + assert process.success assert snapshot(process.out).match() } } @@ -94,6 +107,9 @@ nextflow_process { } then { + assert process.trace.tasks().size() == 1 + assert process.out.fasta.size() == 1 + assert process.success assert snapshot(process.out).match() } } @@ -108,12 +124,15 @@ nextflow_process { """ input[0] = [[ id:'test' ], file('${moduleDir}/tests/data/custom_regex/manifest.txt'), - []] + file('${moduleDir}/assets/NO_FILE')] """ } } then { + assert process.trace.tasks().size() == 1 + assert process.out.fasta.size() == 1 + assert process.success assert snapshot(process.out).match() } } @@ -126,14 +145,17 @@ nextflow_process { """ input[0] = [[ id:'test' ], file('${moduleDir}/tests/data/order/manifest.txt'), - []] + file('${moduleDir}/assets/NO_FILE')] """ } } then { + assert process.trace.tasks().size() == 1 + assert process.out.fasta.size() == 1 + assert process.success assert snapshot(process.out).match() } -} + } } diff --git a/modules/ensembl/fasta/recombine/tests/main.nf.test.snap b/modules/ensembl/fasta/recombine/tests/main.nf.test.snap index 75786c9..30b0258 100644 --- a/modules/ensembl/fasta/recombine/tests/main.nf.test.snap +++ b/modules/ensembl/fasta/recombine/tests/main.nf.test.snap @@ -3,102 +3,162 @@ "content": [ { "0": [ - + [ + { + "id": "test" + }, + "test.fa:md5,3ec81eef9dd73dc86ff01621dbacc7a0" + ] ], "fasta": [ - + [ + { + "id": "test" + }, + "test.fa:md5,3ec81eef9dd73dc86ff01621dbacc7a0" + ] ] } ], + 
"timestamp": "2026-02-13T15:20:39.92005", "meta": { - "nf-test": "0.9.3", + "nf-test": "0.9.4", "nextflow": "25.10.3" - }, - "timestamp": "2026-02-10T15:17:42.590604" + } }, - "Real run: header recombination": { + "Real run: AGP recombination": { "content": [ { "0": [ - + [ + { + "id": "test" + }, + "test.fa:md5,5f81df5939251499ea60e666d8a306b3" + ] ], "fasta": [ - + [ + { + "id": "test" + }, + "test.fa:md5,5f81df5939251499ea60e666d8a306b3" + ] ] } ], + "timestamp": "2026-02-13T15:20:44.283073", "meta": { - "nf-test": "0.9.3", + "nf-test": "0.9.4", "nextflow": "25.10.3" - }, - "timestamp": "2026-02-10T15:17:44.398141" + } }, - "Real run: AGP recombination": { + "Real run: header recombination": { "content": [ { "0": [ - + [ + { + "id": "test" + }, + "test.fa:md5,709337303b43192f7647b77c170adac7" + ] ], "fasta": [ - + [ + { + "id": "test" + }, + "test.fa:md5,709337303b43192f7647b77c170adac7" + ] ] } ], + "timestamp": "2026-02-13T15:23:27.996406", "meta": { - "nf-test": "0.9.3", + "nf-test": "0.9.4", "nextflow": "25.10.3" - }, - "timestamp": "2026-02-10T15:17:46.212088" + } }, "Stub outputs: header mode": { "content": [ { "0": [ - + [ + { + "id": "test" + }, + "test.fa:md5,93d1870d020e197708753501e57db68f" + ] ], "fasta": [ - + [ + { + "id": "test" + }, + "test.fa:md5,93d1870d020e197708753501e57db68f" + ] ] } ], + "timestamp": "2026-02-13T15:20:37.864233", "meta": { - "nf-test": "0.9.3", + "nf-test": "0.9.4", "nextflow": "25.10.3" - }, - "timestamp": "2026-02-10T15:17:40.786056" + } }, "Real run: custom chunk regex": { "content": [ { "0": [ - + [ + { + "id": "test" + }, + "test.fa:md5,90d526c36d03ae9e226d09655f22f00e" + ] ], "fasta": [ - + [ + { + "id": "test" + }, + "test.fa:md5,90d526c36d03ae9e226d09655f22f00e" + ] ] } ], + "timestamp": "2026-02-13T15:26:49.012219", "meta": { - "nf-test": "0.9.3", + "nf-test": "0.9.4", "nextflow": "25.10.3" - }, - "timestamp": "2026-02-10T15:17:49.822476" + } }, "Real run: manifest order is preserved": { "content": [ { "0": [ - + [ 
+ { + "id": "test" + }, + "test.fa:md5,52fa2054da674f0a5ebc263e724cf4a4" + ] ], "fasta": [ - + [ + { + "id": "test" + }, + "test.fa:md5,52fa2054da674f0a5ebc263e724cf4a4" + ] ] } ], + "timestamp": "2026-02-13T15:20:48.517972", "meta": { - "nf-test": "0.9.3", + "nf-test": "0.9.4", "nextflow": "25.10.3" - }, - "timestamp": "2026-02-12T13:45:22.67052" + } } } \ No newline at end of file diff --git a/modules/ensembl/fasta/split/main.nf b/modules/ensembl/fasta/split/main.nf index fd53d9f..5d2a347 100644 --- a/modules/ensembl/fasta/split/main.nf +++ b/modules/ensembl/fasta/split/main.nf @@ -21,14 +21,12 @@ process FASTA_SPLIT { conda "${moduleDir}/environment.yml" container "ensemblorg/ensembl-genomio:v1.6.1" - publishDir "${params.outdir ?: '.'}", mode: 'copy' - input: tuple val(meta), path(fasta) output: - tuple val(meta), path("**/*.fa"), emit: fasta - tuple val(meta), path("*.agp"), emit: agp, optional: true + tuple val(meta), path("splits/**/*.fa"), emit: fasta + tuple val(meta), path("splits/*.agp"), emit: agp, optional: true script: def args = [] @@ -72,7 +70,7 @@ process FASTA_SPLIT { """ fasta_split \\ --fasta-file ${fasta} \\ - --out-dir \$PWD \\ + --out-dir splits \\ ${args.join(' ')} """ @@ -93,7 +91,7 @@ process FASTA_SPLIT { cp -R "\$test_data_dir/splits/\$layout/." 
"splits/" if [[ "${params.write_agp ?: false}" == "true" ]]; then - cp "\$test_data_dir/agp/test.agp" "${meta.id}.agp" + cp "\$test_data_dir/agp/test.agp" "splits/${meta.id}.agp" fi """ diff --git a/modules/ensembl/fasta/split/tests/main.nf.test.snap b/modules/ensembl/fasta/split/tests/main.nf.test.snap index 7c44fbc..167ba7c 100644 --- a/modules/ensembl/fasta/split/tests/main.nf.test.snap +++ b/modules/ensembl/fasta/split/tests/main.nf.test.snap @@ -42,11 +42,11 @@ ] } ], + "timestamp": "2026-02-13T15:27:15.469156", "meta": { - "nf-test": "0.9.3", - "nextflow": "25.04.6" - }, - "timestamp": "2026-02-03T11:44:20.723299027" + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } }, "Stub outputs: nested directory layout contract": { "content": [ @@ -81,11 +81,11 @@ ] } ], + "timestamp": "2026-02-13T15:27:19.735631", "meta": { - "nf-test": "0.9.3", - "nextflow": "25.04.6" - }, - "timestamp": "2026-02-03T11:44:45.167257411" + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } }, "Stub outputs: default layout, no AGP": { "content": [ @@ -120,11 +120,11 @@ ] } ], + "timestamp": "2026-02-13T15:27:13.38194", "meta": { - "nf-test": "0.9.3", - "nextflow": "25.04.6" - }, - "timestamp": "2026-02-03T11:44:08.447183258" + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } }, "Stub outputs: unique_file_names contract": { "content": [ @@ -159,10 +159,10 @@ ] } ], + "timestamp": "2026-02-13T15:27:17.614981", "meta": { - "nf-test": "0.9.3", - "nextflow": "25.04.6" - }, - "timestamp": "2026-02-03T11:44:33.225993321" + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } } } \ No newline at end of file From 410a94400a832dc42a4d4f72b9a8454695041a61 Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Thu, 19 Feb 2026 00:12:48 +0000 Subject: [PATCH 17/36] Add repeats/combine_json module --- .../fasta/recombine => }/assets/NO_FILE | 0 modules/ensembl/fasta/recombine/main.nf | 3 +- .../fasta/recombine/tests/main.nf.test | 20 +-- .../fasta/recombine/tests/main.nf.test.snap | 36 ++-- 
modules/ensembl/fasta/split/main.nf | 2 +- .../ensembl/fasta/split/tests/main.nf.test | 22 +-- .../fasta/split/tests/main.nf.test.snap | 16 +- .../repeats/combine_json/environment.yml | 7 + modules/ensembl/repeats/combine_json/main.nf | 79 +++++++++ .../tests/data/agp/inputs/in.json | 34 ++++ .../combine_json/tests/data/agp/manifest.txt | 1 + .../tests/data/agp/output/test.repeat.json | 34 ++++ .../combine_json/tests/data/agp/test.agp | 1 + .../tests/data/custom_regex/inputs/in.json | 34 ++++ .../tests/data/custom_regex/manifest.txt | 1 + .../data/custom_regex/output/test.repeat.json | 34 ++++ .../tests/data/header/inputs/a.json | 34 ++++ .../tests/data/header/inputs/b.json | 34 ++++ .../tests/data/header/manifest.txt | 2 + .../tests/data/header/output/test.repeat.json | 43 +++++ .../tests/data/order/inputs/01.json | 34 ++++ .../tests/data/order/inputs/02.json | 34 ++++ .../tests/data/order/manifest.txt | 2 + .../tests/data/order/output/test.repeat.json | 43 +++++ .../repeats/combine_json/tests/main.nf.test | 153 ++++++++++++++++ .../combine_json/tests/main.nf.test.snap | 164 ++++++++++++++++++ 26 files changed, 817 insertions(+), 50 deletions(-) rename modules/{ensembl/fasta/recombine => }/assets/NO_FILE (100%) create mode 100644 modules/ensembl/repeats/combine_json/environment.yml create mode 100644 modules/ensembl/repeats/combine_json/main.nf create mode 100644 modules/ensembl/repeats/combine_json/tests/data/agp/inputs/in.json create mode 100644 modules/ensembl/repeats/combine_json/tests/data/agp/manifest.txt create mode 100644 modules/ensembl/repeats/combine_json/tests/data/agp/output/test.repeat.json create mode 100644 modules/ensembl/repeats/combine_json/tests/data/agp/test.agp create mode 100644 modules/ensembl/repeats/combine_json/tests/data/custom_regex/inputs/in.json create mode 100644 modules/ensembl/repeats/combine_json/tests/data/custom_regex/manifest.txt create mode 100644 
modules/ensembl/repeats/combine_json/tests/data/custom_regex/output/test.repeat.json create mode 100644 modules/ensembl/repeats/combine_json/tests/data/header/inputs/a.json create mode 100644 modules/ensembl/repeats/combine_json/tests/data/header/inputs/b.json create mode 100644 modules/ensembl/repeats/combine_json/tests/data/header/manifest.txt create mode 100644 modules/ensembl/repeats/combine_json/tests/data/header/output/test.repeat.json create mode 100644 modules/ensembl/repeats/combine_json/tests/data/order/inputs/01.json create mode 100644 modules/ensembl/repeats/combine_json/tests/data/order/inputs/02.json create mode 100644 modules/ensembl/repeats/combine_json/tests/data/order/manifest.txt create mode 100644 modules/ensembl/repeats/combine_json/tests/data/order/output/test.repeat.json create mode 100644 modules/ensembl/repeats/combine_json/tests/main.nf.test create mode 100644 modules/ensembl/repeats/combine_json/tests/main.nf.test.snap diff --git a/modules/ensembl/fasta/recombine/assets/NO_FILE b/modules/assets/NO_FILE similarity index 100% rename from modules/ensembl/fasta/recombine/assets/NO_FILE rename to modules/assets/NO_FILE diff --git a/modules/ensembl/fasta/recombine/main.nf b/modules/ensembl/fasta/recombine/main.nf index 4e8e13a..9a7fd97 100644 --- a/modules/ensembl/fasta/recombine/main.nf +++ b/modules/ensembl/fasta/recombine/main.nf @@ -25,7 +25,7 @@ process FASTA_RECOMBINE { tuple val(meta), path(fasta_manifest), path(agp) output: - tuple val(meta), path("*.fa"), emit: fasta + tuple val(meta), path("${meta.id}.fa"), emit: recombined_fasta script: def args = [] @@ -74,6 +74,5 @@ process FASTA_RECOMBINE { cp "\$test_data_dir/\$mode/output/${meta.id}.fa" "\$out_fasta" """ - } diff --git a/modules/ensembl/fasta/recombine/tests/main.nf.test b/modules/ensembl/fasta/recombine/tests/main.nf.test index ba4bfc6..ef81bd9 100644 --- a/modules/ensembl/fasta/recombine/tests/main.nf.test +++ b/modules/ensembl/fasta/recombine/tests/main.nf.test @@ -35,7 +35,7 
@@ nextflow_process { """ input[0] = [[ id:'test' ], file('${moduleDir}/tests/data/header/manifest.txt'), - file('${moduleDir}/assets/NO_FILE')] + file('${projectDir}/modules/assets/NO_FILE')] """ } @@ -43,7 +43,7 @@ nextflow_process { then { assert process.trace.tasks().size() == 1 - assert process.out.fasta.size() == 1 + assert process.out.recombined_fasta.size() == 1 assert process.success assert snapshot(process.out).match() } @@ -66,7 +66,7 @@ nextflow_process { then { assert process.trace.tasks().size() == 1 - assert process.out.fasta.size() == 1 + assert process.out.recombined_fasta.size() == 1 assert process.success assert snapshot(process.out).match() } @@ -80,14 +80,14 @@ nextflow_process { """ input[0] = [[ id:'test' ], file('${moduleDir}/tests/data/header/manifest.txt'), - file('${moduleDir}/assets/NO_FILE')] + file('${projectDir}/modules/assets/NO_FILE')] """ } } then { assert process.trace.tasks().size() == 1 - assert process.out.fasta.size() == 1 + assert process.out.recombined_fasta.size() == 1 assert process.success assert snapshot(process.out).match() } @@ -108,7 +108,7 @@ nextflow_process { then { assert process.trace.tasks().size() == 1 - assert process.out.fasta.size() == 1 + assert process.out.recombined_fasta.size() == 1 assert process.success assert snapshot(process.out).match() } @@ -124,14 +124,14 @@ nextflow_process { """ input[0] = [[ id:'test' ], file('${moduleDir}/tests/data/custom_regex/manifest.txt'), - file('${moduleDir}/assets/NO_FILE')] + file('${projectDir}/modules/assets/NO_FILE')] """ } } then { assert process.trace.tasks().size() == 1 - assert process.out.fasta.size() == 1 + assert process.out.recombined_fasta.size() == 1 assert process.success assert snapshot(process.out).match() } @@ -145,14 +145,14 @@ nextflow_process { """ input[0] = [[ id:'test' ], file('${moduleDir}/tests/data/order/manifest.txt'), - file('${moduleDir}/assets/NO_FILE')] + file('${projectDir}/modules/assets/NO_FILE')] """ } } then { assert 
process.trace.tasks().size() == 1 - assert process.out.fasta.size() == 1 + assert process.out.recombined_fasta.size() == 1 assert process.success assert snapshot(process.out).match() } diff --git a/modules/ensembl/fasta/recombine/tests/main.nf.test.snap b/modules/ensembl/fasta/recombine/tests/main.nf.test.snap index 30b0258..3a27deb 100644 --- a/modules/ensembl/fasta/recombine/tests/main.nf.test.snap +++ b/modules/ensembl/fasta/recombine/tests/main.nf.test.snap @@ -10,7 +10,7 @@ "test.fa:md5,3ec81eef9dd73dc86ff01621dbacc7a0" ] ], - "fasta": [ + "recombined_fasta": [ [ { "id": "test" @@ -20,7 +20,7 @@ ] } ], - "timestamp": "2026-02-13T15:20:39.92005", + "timestamp": "2026-02-18T23:12:05.089688", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" @@ -34,20 +34,20 @@ { "id": "test" }, - "test.fa:md5,5f81df5939251499ea60e666d8a306b3" + "test.fa:md5,f32bc79faea4bc05dd4675e0d4ededa1" ] ], - "fasta": [ + "recombined_fasta": [ [ { "id": "test" }, - "test.fa:md5,5f81df5939251499ea60e666d8a306b3" + "test.fa:md5,f32bc79faea4bc05dd4675e0d4ededa1" ] ] } ], - "timestamp": "2026-02-13T15:20:44.283073", + "timestamp": "2026-02-18T23:12:09.601838", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" @@ -61,20 +61,20 @@ { "id": "test" }, - "test.fa:md5,709337303b43192f7647b77c170adac7" + "test.fa:md5,700550164316730d1145b7bde2ae3eb7" ] ], - "fasta": [ + "recombined_fasta": [ [ { "id": "test" }, - "test.fa:md5,709337303b43192f7647b77c170adac7" + "test.fa:md5,700550164316730d1145b7bde2ae3eb7" ] ] } ], - "timestamp": "2026-02-13T15:23:27.996406", + "timestamp": "2026-02-18T23:12:07.342405", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" @@ -91,7 +91,7 @@ "test.fa:md5,93d1870d020e197708753501e57db68f" ] ], - "fasta": [ + "recombined_fasta": [ [ { "id": "test" @@ -101,7 +101,7 @@ ] } ], - "timestamp": "2026-02-13T15:20:37.864233", + "timestamp": "2026-02-18T23:12:03.015143", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" @@ -115,20 +115,20 @@ { "id": "test" }, - 
"test.fa:md5,90d526c36d03ae9e226d09655f22f00e" + "test.fa:md5,a589b60028be69f01622a61cc78fa1ae" ] ], - "fasta": [ + "recombined_fasta": [ [ { "id": "test" }, - "test.fa:md5,90d526c36d03ae9e226d09655f22f00e" + "test.fa:md5,a589b60028be69f01622a61cc78fa1ae" ] ] } ], - "timestamp": "2026-02-13T15:26:49.012219", + "timestamp": "2026-02-18T23:12:11.852053", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" @@ -145,7 +145,7 @@ "test.fa:md5,52fa2054da674f0a5ebc263e724cf4a4" ] ], - "fasta": [ + "recombined_fasta": [ [ { "id": "test" @@ -155,7 +155,7 @@ ] } ], - "timestamp": "2026-02-13T15:20:48.517972", + "timestamp": "2026-02-18T23:12:14.083842", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" diff --git a/modules/ensembl/fasta/split/main.nf b/modules/ensembl/fasta/split/main.nf index 5d2a347..845628b 100644 --- a/modules/ensembl/fasta/split/main.nf +++ b/modules/ensembl/fasta/split/main.nf @@ -25,7 +25,7 @@ process FASTA_SPLIT { tuple val(meta), path(fasta) output: - tuple val(meta), path("splits/**/*.fa"), emit: fasta + tuple val(meta), path("splits/**/*.fa"), emit: fastas tuple val(meta), path("splits/*.agp"), emit: agp, optional: true script: diff --git a/modules/ensembl/fasta/split/tests/main.nf.test b/modules/ensembl/fasta/split/tests/main.nf.test index cf4206f..37211ae 100644 --- a/modules/ensembl/fasta/split/tests/main.nf.test +++ b/modules/ensembl/fasta/split/tests/main.nf.test @@ -50,10 +50,10 @@ nextflow_process { assert snapshot(process.out).match() // fasta: tuple(meta, fa_paths) - assert process.out.fasta != null - assert process.out.fasta.size() == 1 + assert process.out.fastas != null + assert process.out.fastas.size() == 1 - def fasta_out = process.out.fasta[0] + def fasta_out = process.out.fastas[0] def meta = fasta_out[0] def fas = fasta_out[1] @@ -98,8 +98,8 @@ nextflow_process { then { assert snapshot(process.out).match() - assert process.out.fasta.size() == 1 - def fasta_out = process.out.fasta[0] + assert process.out.fastas.size() == 1 + def 
fasta_out = process.out.fastas[0] def fas = fasta_out[1] assert fas.size() == 2 @@ -146,7 +146,7 @@ nextflow_process { then { assert snapshot(process.out).match() - def fasta_out = process.out.fasta[0] + def fasta_out = process.out.fastas[0] def fas = fasta_out[1] assert fas.size() == 2 @@ -183,7 +183,7 @@ nextflow_process { then { assert snapshot(process.out).match() - def fastas = process.out.fasta[0][1] + def fastas = process.out.fastas[0][1] assert fastas.size() == 2 assert process.out.agp.size() == 0 @@ -218,10 +218,10 @@ nextflow_process { then { assert process.success - assert process.out.fasta != null - assert process.out.fasta.size() == 1 + assert process.out.fastas != null + assert process.out.fastas.size() == 1 - def out = process.out.fasta[0] + def out = process.out.fastas[0] def meta = out[0] def fas = out[1] @@ -303,7 +303,7 @@ nextflow_process { then { assert process.success - def fas = process.out.fasta[0][1] + def fas = process.out.fastas[0][1] assert fas.size() == 2 def merged = fas diff --git a/modules/ensembl/fasta/split/tests/main.nf.test.snap b/modules/ensembl/fasta/split/tests/main.nf.test.snap index 167ba7c..eb12321 100644 --- a/modules/ensembl/fasta/split/tests/main.nf.test.snap +++ b/modules/ensembl/fasta/split/tests/main.nf.test.snap @@ -29,7 +29,7 @@ "test.agp:md5,c12ac51bd2b1ca95cdd8f011eca0cd1c" ] ], - "fasta": [ + "fastas": [ [ { "id": "test" @@ -42,7 +42,7 @@ ] } ], - "timestamp": "2026-02-13T15:27:15.469156", + "timestamp": "2026-02-18T23:21:51.036982", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" @@ -68,7 +68,7 @@ "agp": [ ], - "fasta": [ + "fastas": [ [ { "id": "test" @@ -81,7 +81,7 @@ ] } ], - "timestamp": "2026-02-13T15:27:19.735631", + "timestamp": "2026-02-18T23:06:24.284416", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" @@ -107,7 +107,7 @@ "agp": [ ], - "fasta": [ + "fastas": [ [ { "id": "test" @@ -120,7 +120,7 @@ ] } ], - "timestamp": "2026-02-13T15:27:13.38194", + "timestamp": "2026-02-18T23:06:18.00303", 
"meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" @@ -146,7 +146,7 @@ "agp": [ ], - "fasta": [ + "fastas": [ [ { "id": "test" @@ -159,7 +159,7 @@ ] } ], - "timestamp": "2026-02-13T15:27:17.614981", + "timestamp": "2026-02-18T23:06:22.194395", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" diff --git a/modules/ensembl/repeats/combine_json/environment.yml b/modules/ensembl/repeats/combine_json/environment.yml new file mode 100644 index 0000000..8bdd6b1 --- /dev/null +++ b/modules/ensembl/repeats/combine_json/environment.yml @@ -0,0 +1,7 @@ +--- +name: "repeats_combine_json" +channels: + - conda-forge + - bioconda +dependencies: + - ensembl-genomio=1.6.1 \ No newline at end of file diff --git a/modules/ensembl/repeats/combine_json/main.nf b/modules/ensembl/repeats/combine_json/main.nf new file mode 100644 index 0000000..ff2177b --- /dev/null +++ b/modules/ensembl/repeats/combine_json/main.nf @@ -0,0 +1,79 @@ +// See the NOTICE file distributed with this work for additional information +// regarding copyright ownership. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +process REPEATS_COMBINE_JSON { + + tag "${meta.id}" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "ensemblorg/ensembl-genomio:v1.6.1" + + input: + tuple val(meta), path(json_manifest), path(agp) + + output: + tuple val(meta), path("${meta.id}.repeat.json"), emit: combined_json + + script: + def args = [] + + if (params.chunk_id_regex) { + def rx = params.chunk_id_regex.replace("'", "'\"'\"'") + args << "--chunk-id-regex '${rx}'" + } + + if (params.allow_revcomp) { + args << "--allow-revcomp" + } + + def has_agp = agp && agp.baseName != 'NO_FILE' + if (has_agp) { + args << "--agp-file ${agp}" + } + + def out_json = "${meta.id}.repeat.json" + + """ + python -m ensembl.io.genomio.repeats.combine_json \\ + --json-manifest ${json_manifest} \\ + --out-json ${out_json} \\ + ${args.join(' ')} + """ + + stub: + """ + set -euo pipefail + + test_data_dir="${moduleDir}/tests/data" + + out_json="${meta.id}.repeat.json" + + test -s "${json_manifest}" + + mode="header" + agp_path="${agp}" + agp_name="\${agp_path##*/}" + if [[ "\$agp_name" != "NO_FILE" ]]; then + mode="agp" + fi + + # Provide a schema-valid combined JSON fixture. 
+ # Arrange fixtures under: + # tests/data/header/output/.repeat.json + # tests/data/agp/output/.repeat.json + cp "\$test_data_dir/\$mode/output/${meta.id}.repeat.json" "\$out_json" + """ +} diff --git a/modules/ensembl/repeats/combine_json/tests/data/agp/inputs/in.json b/modules/ensembl/repeats/combine_json/tests/data/agp/inputs/in.json new file mode 100644 index 0000000..8228fd3 --- /dev/null +++ b/modules/ensembl/repeats/combine_json/tests/data/agp/inputs/in.json @@ -0,0 +1,34 @@ +{ + "analysis": { + "run_date": "2026-02-18T00:00:00Z", + "logic_name": "rm", + "display_label": "rm", + "description": "rm analysis (nf-test)", + "program": "stub", + "program_version": "0.0" + }, + "source": { + "source_provider": "prov", + "is_primary": true + }, + "repeat_consensus": [ + { + "repeat_consensus_key": "58bc82baa00a592e0b49f526b80a7c89", + "repeat_name": "Alu", + "repeat_class": "SINE", + "repeat_type": "Alu", + "repeat_consensus": "ACGT" + } + ], + "repeat_features": [ + { + "seq_region": "comp1", + "seq_region_start": 10, + "seq_region_end": 20, + "seq_region_strand": 1, + "repeat_start": 1, + "repeat_end": 11, + "repeat_consensus": "58bc82baa00a592e0b49f526b80a7c89" + } + ] +} diff --git a/modules/ensembl/repeats/combine_json/tests/data/agp/manifest.txt b/modules/ensembl/repeats/combine_json/tests/data/agp/manifest.txt new file mode 100644 index 0000000..1ac93e6 --- /dev/null +++ b/modules/ensembl/repeats/combine_json/tests/data/agp/manifest.txt @@ -0,0 +1 @@ +inputs/in.json diff --git a/modules/ensembl/repeats/combine_json/tests/data/agp/output/test.repeat.json b/modules/ensembl/repeats/combine_json/tests/data/agp/output/test.repeat.json new file mode 100644 index 0000000..cfc4cd2 --- /dev/null +++ b/modules/ensembl/repeats/combine_json/tests/data/agp/output/test.repeat.json @@ -0,0 +1,34 @@ +{ + "analysis": { + "run_date": "2026-02-18T00:00:00Z", + "logic_name": "rm", + "display_label": "rm", + "description": "rm analysis (nf-test)", + "program": "stub", + 
"program_version": "0.0" + }, + "source": { + "source_provider": "prov", + "is_primary": true + }, + "repeat_consensus": [ + { + "repeat_consensus_key": "58bc82baa00a592e0b49f526b80a7c89", + "repeat_name": "Alu", + "repeat_class": "SINE", + "repeat_type": "Alu", + "repeat_consensus": "ACGT" + } + ], + "repeat_features": [ + { + "seq_region": "chr1", + "seq_region_start": 109, + "seq_region_end": 119, + "seq_region_strand": 1, + "repeat_start": 1, + "repeat_end": 11, + "repeat_consensus": "58bc82baa00a592e0b49f526b80a7c89" + } + ] +} diff --git a/modules/ensembl/repeats/combine_json/tests/data/agp/test.agp b/modules/ensembl/repeats/combine_json/tests/data/agp/test.agp new file mode 100644 index 0000000..86dddab --- /dev/null +++ b/modules/ensembl/repeats/combine_json/tests/data/agp/test.agp @@ -0,0 +1 @@ +chr1 100 199 1 W comp1 1 100 + diff --git a/modules/ensembl/repeats/combine_json/tests/data/custom_regex/inputs/in.json b/modules/ensembl/repeats/combine_json/tests/data/custom_regex/inputs/in.json new file mode 100644 index 0000000..69bfad7 --- /dev/null +++ b/modules/ensembl/repeats/combine_json/tests/data/custom_regex/inputs/in.json @@ -0,0 +1,34 @@ +{ + "analysis": { + "run_date": "2026-02-18T00:00:00Z", + "logic_name": "rm", + "display_label": "rm", + "description": "rm analysis (nf-test)", + "program": "stub", + "program_version": "0.0" + }, + "source": { + "source_provider": "prov", + "is_primary": true + }, + "repeat_consensus": [ + { + "repeat_consensus_key": "58bc82baa00a592e0b49f526b80a7c89", + "repeat_name": "Alu", + "repeat_class": "SINE", + "repeat_type": "Alu", + "repeat_consensus": "ACGT" + } + ], + "repeat_features": [ + { + "seq_region": "chr1_11", + "seq_region_start": 1, + "seq_region_end": 5, + "seq_region_strand": 1, + "repeat_start": 1, + "repeat_end": 5, + "repeat_consensus": "58bc82baa00a592e0b49f526b80a7c89" + } + ] +} diff --git a/modules/ensembl/repeats/combine_json/tests/data/custom_regex/manifest.txt 
b/modules/ensembl/repeats/combine_json/tests/data/custom_regex/manifest.txt new file mode 100644 index 0000000..1ac93e6 --- /dev/null +++ b/modules/ensembl/repeats/combine_json/tests/data/custom_regex/manifest.txt @@ -0,0 +1 @@ +inputs/in.json diff --git a/modules/ensembl/repeats/combine_json/tests/data/custom_regex/output/test.repeat.json b/modules/ensembl/repeats/combine_json/tests/data/custom_regex/output/test.repeat.json new file mode 100644 index 0000000..0d8eff4 --- /dev/null +++ b/modules/ensembl/repeats/combine_json/tests/data/custom_regex/output/test.repeat.json @@ -0,0 +1,34 @@ +{ + "analysis": { + "run_date": "2026-02-18T00:00:00Z", + "logic_name": "rm", + "display_label": "rm", + "description": "rm analysis (nf-test)", + "program": "stub", + "program_version": "0.0" + }, + "source": { + "source_provider": "prov", + "is_primary": true + }, + "repeat_consensus": [ + { + "repeat_consensus_key": "58bc82baa00a592e0b49f526b80a7c89", + "repeat_name": "Alu", + "repeat_class": "SINE", + "repeat_type": "Alu", + "repeat_consensus": "ACGT" + } + ], + "repeat_features": [ + { + "seq_region": "chr1", + "seq_region_start": 11, + "seq_region_end": 15, + "seq_region_strand": 1, + "repeat_start": 1, + "repeat_end": 5, + "repeat_consensus": "58bc82baa00a592e0b49f526b80a7c89" + } + ] +} diff --git a/modules/ensembl/repeats/combine_json/tests/data/header/inputs/a.json b/modules/ensembl/repeats/combine_json/tests/data/header/inputs/a.json new file mode 100644 index 0000000..b33f05c --- /dev/null +++ b/modules/ensembl/repeats/combine_json/tests/data/header/inputs/a.json @@ -0,0 +1,34 @@ +{ + "analysis": { + "run_date": "2026-02-18T00:00:00Z", + "logic_name": "rm", + "display_label": "rm", + "description": "rm analysis (nf-test)", + "program": "stub", + "program_version": "0.0" + }, + "source": { + "source_provider": "prov", + "is_primary": true + }, + "repeat_consensus": [ + { + "repeat_consensus_key": "58bc82baa00a592e0b49f526b80a7c89", + "repeat_name": "Alu", + 
"repeat_class": "SINE", + "repeat_type": "Alu", + "repeat_consensus": "ACGT" + } + ], + "repeat_features": [ + { + "seq_region": "chr1_chunk_start_1", + "seq_region_start": 1, + "seq_region_end": 3, + "seq_region_strand": 1, + "repeat_start": 1, + "repeat_end": 3, + "repeat_consensus": "58bc82baa00a592e0b49f526b80a7c89" + } + ] +} diff --git a/modules/ensembl/repeats/combine_json/tests/data/header/inputs/b.json b/modules/ensembl/repeats/combine_json/tests/data/header/inputs/b.json new file mode 100644 index 0000000..e6787cb --- /dev/null +++ b/modules/ensembl/repeats/combine_json/tests/data/header/inputs/b.json @@ -0,0 +1,34 @@ +{ + "analysis": { + "run_date": "2026-02-18T00:00:00Z", + "logic_name": "rm", + "display_label": "rm", + "description": "rm analysis (nf-test)", + "program": "stub", + "program_version": "0.0" + }, + "source": { + "source_provider": "prov", + "is_primary": true + }, + "repeat_consensus": [ + { + "repeat_consensus_key": "58bc82baa00a592e0b49f526b80a7c89", + "repeat_name": "Alu", + "repeat_class": "SINE", + "repeat_type": "Alu", + "repeat_consensus": "ACGT" + } + ], + "repeat_features": [ + { + "seq_region": "chr1_chunk_start_4", + "seq_region_start": 1, + "seq_region_end": 2, + "seq_region_strand": 1, + "repeat_start": 1, + "repeat_end": 2, + "repeat_consensus": "58bc82baa00a592e0b49f526b80a7c89" + } + ] +} diff --git a/modules/ensembl/repeats/combine_json/tests/data/header/manifest.txt b/modules/ensembl/repeats/combine_json/tests/data/header/manifest.txt new file mode 100644 index 0000000..419c5fd --- /dev/null +++ b/modules/ensembl/repeats/combine_json/tests/data/header/manifest.txt @@ -0,0 +1,2 @@ +inputs/a.json +inputs/b.json diff --git a/modules/ensembl/repeats/combine_json/tests/data/header/output/test.repeat.json b/modules/ensembl/repeats/combine_json/tests/data/header/output/test.repeat.json new file mode 100644 index 0000000..c69532b --- /dev/null +++ b/modules/ensembl/repeats/combine_json/tests/data/header/output/test.repeat.json 
@@ -0,0 +1,43 @@ +{ + "analysis": { + "run_date": "2026-02-18T00:00:00Z", + "logic_name": "rm", + "display_label": "rm", + "description": "rm analysis (nf-test)", + "program": "stub", + "program_version": "0.0" + }, + "source": { + "source_provider": "prov", + "is_primary": true + }, + "repeat_consensus": [ + { + "repeat_consensus_key": "58bc82baa00a592e0b49f526b80a7c89", + "repeat_name": "Alu", + "repeat_class": "SINE", + "repeat_type": "Alu", + "repeat_consensus": "ACGT" + } + ], + "repeat_features": [ + { + "seq_region": "chr1", + "seq_region_start": 1, + "seq_region_end": 3, + "seq_region_strand": 1, + "repeat_start": 1, + "repeat_end": 3, + "repeat_consensus": "58bc82baa00a592e0b49f526b80a7c89" + }, + { + "seq_region": "chr1", + "seq_region_start": 4, + "seq_region_end": 5, + "seq_region_strand": 1, + "repeat_start": 1, + "repeat_end": 2, + "repeat_consensus": "58bc82baa00a592e0b49f526b80a7c89" + } + ] +} diff --git a/modules/ensembl/repeats/combine_json/tests/data/order/inputs/01.json b/modules/ensembl/repeats/combine_json/tests/data/order/inputs/01.json new file mode 100644 index 0000000..269ac0b --- /dev/null +++ b/modules/ensembl/repeats/combine_json/tests/data/order/inputs/01.json @@ -0,0 +1,34 @@ +{ + "analysis": { + "run_date": "2026-02-18T00:00:00Z", + "logic_name": "rm", + "display_label": "rm", + "description": "rm analysis (nf-test)", + "program": "stub", + "program_version": "0.0" + }, + "source": { + "source_provider": "prov", + "is_primary": true + }, + "repeat_consensus": [ + { + "repeat_consensus_key": "58bc82baa00a592e0b49f526b80a7c89", + "repeat_name": "Alu", + "repeat_class": "SINE", + "repeat_type": "Alu", + "repeat_consensus": "ACGT" + } + ], + "repeat_features": [ + { + "seq_region": "chr2_chunk_start_1", + "seq_region_start": 1, + "seq_region_end": 2, + "seq_region_strand": 1, + "repeat_start": 1, + "repeat_end": 2, + "repeat_consensus": "58bc82baa00a592e0b49f526b80a7c89" + } + ] +} diff --git 
a/modules/ensembl/repeats/combine_json/tests/data/order/inputs/02.json b/modules/ensembl/repeats/combine_json/tests/data/order/inputs/02.json new file mode 100644 index 0000000..8256fd2 --- /dev/null +++ b/modules/ensembl/repeats/combine_json/tests/data/order/inputs/02.json @@ -0,0 +1,34 @@ +{ + "analysis": { + "run_date": "2026-02-18T00:00:00Z", + "logic_name": "rm", + "display_label": "rm", + "description": "rm analysis (nf-test)", + "program": "stub", + "program_version": "0.0" + }, + "source": { + "source_provider": "prov", + "is_primary": true + }, + "repeat_consensus": [ + { + "repeat_consensus_key": "58bc82baa00a592e0b49f526b80a7c89", + "repeat_name": "Alu", + "repeat_class": "SINE", + "repeat_type": "Alu", + "repeat_consensus": "ACGT" + } + ], + "repeat_features": [ + { + "seq_region": "chr2_chunk_start_3", + "seq_region_start": 1, + "seq_region_end": 1, + "seq_region_strand": 1, + "repeat_start": 1, + "repeat_end": 1, + "repeat_consensus": "58bc82baa00a592e0b49f526b80a7c89" + } + ] +} diff --git a/modules/ensembl/repeats/combine_json/tests/data/order/manifest.txt b/modules/ensembl/repeats/combine_json/tests/data/order/manifest.txt new file mode 100644 index 0000000..dad42b0 --- /dev/null +++ b/modules/ensembl/repeats/combine_json/tests/data/order/manifest.txt @@ -0,0 +1,2 @@ +inputs/02.json +inputs/01.json diff --git a/modules/ensembl/repeats/combine_json/tests/data/order/output/test.repeat.json b/modules/ensembl/repeats/combine_json/tests/data/order/output/test.repeat.json new file mode 100644 index 0000000..0442952 --- /dev/null +++ b/modules/ensembl/repeats/combine_json/tests/data/order/output/test.repeat.json @@ -0,0 +1,43 @@ +{ + "analysis": { + "run_date": "2026-02-18T00:00:00Z", + "logic_name": "rm", + "display_label": "rm", + "description": "rm analysis (nf-test)", + "program": "stub", + "program_version": "0.0" + }, + "source": { + "source_provider": "prov", + "is_primary": true + }, + "repeat_consensus": [ + { + "repeat_consensus_key": 
"58bc82baa00a592e0b49f526b80a7c89", + "repeat_name": "Alu", + "repeat_class": "SINE", + "repeat_type": "Alu", + "repeat_consensus": "ACGT" + } + ], + "repeat_features": [ + { + "seq_region": "chr2", + "seq_region_start": 3, + "seq_region_end": 3, + "seq_region_strand": 1, + "repeat_start": 1, + "repeat_end": 1, + "repeat_consensus": "58bc82baa00a592e0b49f526b80a7c89" + }, + { + "seq_region": "chr2", + "seq_region_start": 1, + "seq_region_end": 2, + "seq_region_strand": 1, + "repeat_start": 1, + "repeat_end": 2, + "repeat_consensus": "58bc82baa00a592e0b49f526b80a7c89" + } + ] +} diff --git a/modules/ensembl/repeats/combine_json/tests/main.nf.test b/modules/ensembl/repeats/combine_json/tests/main.nf.test new file mode 100644 index 0000000..5a6eff4 --- /dev/null +++ b/modules/ensembl/repeats/combine_json/tests/main.nf.test @@ -0,0 +1,153 @@ +// See the NOTICE file distributed with this work for additional information +// regarding copyright ownership. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// nf-core modules test repeats/combine_json +nextflow_process { + + name "Test Process REPEATS_COMBINE_JSON" + script "../main.nf" + process "REPEATS_COMBINE_JSON" + + tag "modules" + tag "modules_ensembl" + tag "repeats" + tag "repeats/combine_json" + + test("Stub outputs: header mode") { + + when { + options "-stub" + + process { + """ + input[0] = [[ id:'test' ], + file('${moduleDir}/tests/data/header/manifest.txt'), + file('${projectDir}/modules/assets/NO_FILE')] + """ + } + } + + then { + assert process.trace.tasks().size() == 1 + assert process.out.combined_json.size() == 1 + assert process.success + assert snapshot(process.out).match() + } + } + + test("Stub outputs: AGP mode") { + + when { + options "-stub" + + process { + """ + input[0] = [[ id:'test' ], + file('${moduleDir}/tests/data/agp/manifest.txt'), + file('${moduleDir}/tests/data/agp/test.agp')] + """ + } + } + + then { + assert process.trace.tasks().size() == 1 + assert process.out.combined_json.size() == 1 + assert process.success + assert snapshot(process.out).match() + } + } + + test("Real run: header combine + header-driven liftover") { + + when { + process { + """ + input[0] = [[ id:'test' ], + file('${moduleDir}/tests/data/header/manifest.txt'), + file('${projectDir}/modules/assets/NO_FILE')] + """ + } + } + + then { + assert process.trace.tasks().size() == 1 + assert process.out.combined_json.size() == 1 + assert process.success + assert snapshot(process.out).match() + } + } + + test("Real run: AGP-driven liftover") { + + when { + process { + """ + input[0] = [[ id:'test' ], + file('${moduleDir}/tests/data/agp/manifest.txt'), + file('${moduleDir}/tests/data/agp/test.agp')] + """ + } + } + + then { + assert process.trace.tasks().size() == 1 + assert process.out.combined_json.size() == 1 + assert process.success + assert snapshot(process.out).match() + } + } + + test("Real run: custom chunk regex") { + + when { + params.chunk_id_regex = '^(?P.+)_(?P\\d+)$' + + process { + """ + input[0] = 
[[ id:'test' ], + file('${moduleDir}/tests/data/custom_regex/manifest.txt'), + file('${projectDir}/modules/assets/NO_FILE')] + """ + } + } + + then { + assert process.trace.tasks().size() == 1 + assert process.out.combined_json.size() == 1 + assert process.success + assert snapshot(process.out).match() + } + } + + test("Real run: manifest order is preserved") { + + when { + process { + """ + input[0] = [[ id:'test' ], + file('${moduleDir}/tests/data/order/manifest.txt'), + file('${projectDir}/modules/assets/NO_FILE')] + """ + } + } + + then { + assert process.trace.tasks().size() == 1 + assert process.out.combined_json.size() == 1 + assert process.success + assert snapshot(process.out).match() + } + } +} diff --git a/modules/ensembl/repeats/combine_json/tests/main.nf.test.snap b/modules/ensembl/repeats/combine_json/tests/main.nf.test.snap new file mode 100644 index 0000000..b3fae3f --- /dev/null +++ b/modules/ensembl/repeats/combine_json/tests/main.nf.test.snap @@ -0,0 +1,164 @@ +{ + "Real run: AGP-driven liftover": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.repeat.json:md5,5fc5a0cd8050982334ada4bca1a55950" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.repeat.json:md5,5fc5a0cd8050982334ada4bca1a55950" + ] + ] + } + ], + "timestamp": "2026-02-19T00:11:13.232239", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, + "Stub outputs: AGP mode": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.repeat.json:md5,5fc5a0cd8050982334ada4bca1a55950" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.repeat.json:md5,5fc5a0cd8050982334ada4bca1a55950" + ] + ] + } + ], + "timestamp": "2026-02-19T00:11:08.721986", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, + "Real run: header combine + header-driven liftover": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.repeat.json:md5,007a5710a0037aae8f907d13cde08f77" + ] + ], + "combined_json": [ + [ + { + "id": 
"test" + }, + "test.repeat.json:md5,007a5710a0037aae8f907d13cde08f77" + ] + ] + } + ], + "timestamp": "2026-02-19T00:11:11.007889", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, + "Stub outputs: header mode": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.repeat.json:md5,007a5710a0037aae8f907d13cde08f77" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.repeat.json:md5,007a5710a0037aae8f907d13cde08f77" + ] + ] + } + ], + "timestamp": "2026-02-19T00:11:06.662964", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, + "Real run: custom chunk regex": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.repeat.json:md5,f410544c71be74f7a8a7eab5e494b258" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.repeat.json:md5,f410544c71be74f7a8a7eab5e494b258" + ] + ] + } + ], + "timestamp": "2026-02-19T00:11:15.43463", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, + "Real run: manifest order is preserved": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.repeat.json:md5,1b68c1371265dad11839769a5e776b33" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.repeat.json:md5,1b68c1371265dad11839769a5e776b33" + ] + ] + } + ], + "timestamp": "2026-02-19T00:11:17.627989", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + } +} \ No newline at end of file From 7bfe4c60450ea95001f20b3236974db2f25efe25 Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Mon, 23 Feb 2026 19:22:51 +0000 Subject: [PATCH 18/36] Handle ncRNA features as well as repeats --- .../combine_json/environment.yml | 2 +- modules/ensembl/features/combine_json/main.nf | 116 +++++ .../tests/data/ncrna/agp/.DS_Store | Bin 0 -> 6148 bytes .../tests/data/ncrna/agp/inputs/in.json | 27 + .../tests/data/ncrna}/agp/manifest.txt | 0 .../data/ncrna/agp/output/test.features.json | 27 + .../tests/data/ncrna}/agp/test.agp | 0 
.../tests/data/ncrna/custom_regex/.DS_Store | Bin 0 -> 6148 bytes .../data/ncrna/custom_regex/inputs/a.json | 27 + .../data/ncrna/custom_regex/inputs/b.json | 27 + .../data/ncrna/custom_regex}/manifest.txt | 0 .../custom_regex/output/test.features.json | 37 ++ .../tests/data/ncrna/header/.DS_Store | Bin 0 -> 6148 bytes .../tests/data/ncrna/header/inputs/a.json | 27 + .../tests/data/ncrna/header/inputs/b.json | 27 + .../tests/data/ncrna/header/manifest.txt | 2 + .../ncrna/header/output/test.features.json | 37 ++ .../tests/data/ncrna/order/.DS_Store | Bin 0 -> 6148 bytes .../tests/data/ncrna/order/inputs/01.json | 27 + .../tests/data/ncrna/order/inputs/02.json | 27 + .../tests/data/ncrna}/order/manifest.txt | 0 .../ncrna/order/output/test.features.json | 37 ++ .../tests/data/repeat}/agp/inputs/in.json | 0 .../tests/data/repeat/agp}/manifest.txt | 0 .../repeat/agp/output/test.features.json} | 0 .../tests/data/repeat/agp/test.agp | 1 + .../data/repeat}/custom_regex/inputs/in.json | 0 .../data/repeat/custom_regex/manifest.txt | 1 + .../custom_regex/output/test.features.json} | 0 .../tests/data/repeat}/header/inputs/a.json | 0 .../tests/data/repeat}/header/inputs/b.json | 0 .../tests/data/repeat/header/manifest.txt | 2 + .../repeat/header/output/test.features.json} | 0 .../tests/data/repeat}/order/inputs/01.json | 0 .../tests/data/repeat}/order/inputs/02.json | 0 .../tests/data/repeat/order/manifest.txt | 2 + .../repeat/order/output/test.features.json} | 0 .../features/combine_json/tests/main.nf.test | 280 ++++++++++ .../combine_json/tests/main.nf.test.snap | 488 ++++++++++++++++++ modules/ensembl/repeats/combine_json/main.nf | 79 --- .../repeats/combine_json/tests/main.nf.test | 153 ------ .../combine_json/tests/main.nf.test.snap | 164 ------ 42 files changed, 1220 insertions(+), 397 deletions(-) rename modules/ensembl/{repeats => features}/combine_json/environment.yml (73%) create mode 100644 modules/ensembl/features/combine_json/main.nf create mode 100644 
modules/ensembl/features/combine_json/tests/data/ncrna/agp/.DS_Store create mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/agp/inputs/in.json rename modules/ensembl/{repeats/combine_json/tests/data => features/combine_json/tests/data/ncrna}/agp/manifest.txt (100%) create mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/agp/output/test.features.json rename modules/ensembl/{repeats/combine_json/tests/data => features/combine_json/tests/data/ncrna}/agp/test.agp (100%) create mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/.DS_Store create mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/inputs/a.json create mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/inputs/b.json rename modules/ensembl/{repeats/combine_json/tests/data/header => features/combine_json/tests/data/ncrna/custom_regex}/manifest.txt (100%) create mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/output/test.features.json create mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/header/.DS_Store create mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/header/inputs/a.json create mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/header/inputs/b.json create mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/header/manifest.txt create mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/header/output/test.features.json create mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/order/.DS_Store create mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/order/inputs/01.json create mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/order/inputs/02.json rename modules/ensembl/{repeats/combine_json/tests/data => features/combine_json/tests/data/ncrna}/order/manifest.txt (100%) create mode 100644 
modules/ensembl/features/combine_json/tests/data/ncrna/order/output/test.features.json rename modules/ensembl/{repeats/combine_json/tests/data => features/combine_json/tests/data/repeat}/agp/inputs/in.json (100%) rename modules/ensembl/{repeats/combine_json/tests/data/custom_regex => features/combine_json/tests/data/repeat/agp}/manifest.txt (100%) rename modules/ensembl/{repeats/combine_json/tests/data/agp/output/test.repeat.json => features/combine_json/tests/data/repeat/agp/output/test.features.json} (100%) create mode 100644 modules/ensembl/features/combine_json/tests/data/repeat/agp/test.agp rename modules/ensembl/{repeats/combine_json/tests/data => features/combine_json/tests/data/repeat}/custom_regex/inputs/in.json (100%) create mode 100644 modules/ensembl/features/combine_json/tests/data/repeat/custom_regex/manifest.txt rename modules/ensembl/{repeats/combine_json/tests/data/custom_regex/output/test.repeat.json => features/combine_json/tests/data/repeat/custom_regex/output/test.features.json} (100%) rename modules/ensembl/{repeats/combine_json/tests/data => features/combine_json/tests/data/repeat}/header/inputs/a.json (100%) rename modules/ensembl/{repeats/combine_json/tests/data => features/combine_json/tests/data/repeat}/header/inputs/b.json (100%) create mode 100644 modules/ensembl/features/combine_json/tests/data/repeat/header/manifest.txt rename modules/ensembl/{repeats/combine_json/tests/data/header/output/test.repeat.json => features/combine_json/tests/data/repeat/header/output/test.features.json} (100%) rename modules/ensembl/{repeats/combine_json/tests/data => features/combine_json/tests/data/repeat}/order/inputs/01.json (100%) rename modules/ensembl/{repeats/combine_json/tests/data => features/combine_json/tests/data/repeat}/order/inputs/02.json (100%) create mode 100644 modules/ensembl/features/combine_json/tests/data/repeat/order/manifest.txt rename modules/ensembl/{repeats/combine_json/tests/data/order/output/test.repeat.json => 
features/combine_json/tests/data/repeat/order/output/test.features.json} (100%) create mode 100644 modules/ensembl/features/combine_json/tests/main.nf.test create mode 100644 modules/ensembl/features/combine_json/tests/main.nf.test.snap delete mode 100644 modules/ensembl/repeats/combine_json/main.nf delete mode 100644 modules/ensembl/repeats/combine_json/tests/main.nf.test delete mode 100644 modules/ensembl/repeats/combine_json/tests/main.nf.test.snap diff --git a/modules/ensembl/repeats/combine_json/environment.yml b/modules/ensembl/features/combine_json/environment.yml similarity index 73% rename from modules/ensembl/repeats/combine_json/environment.yml rename to modules/ensembl/features/combine_json/environment.yml index 8bdd6b1..5f1cb32 100644 --- a/modules/ensembl/repeats/combine_json/environment.yml +++ b/modules/ensembl/features/combine_json/environment.yml @@ -1,5 +1,5 @@ --- -name: "repeats_combine_json" +name: "features_combine_json" channels: - conda-forge - bioconda diff --git a/modules/ensembl/features/combine_json/main.nf b/modules/ensembl/features/combine_json/main.nf new file mode 100644 index 0000000..18425e5 --- /dev/null +++ b/modules/ensembl/features/combine_json/main.nf @@ -0,0 +1,116 @@ +// See the NOTICE file distributed with this work for additional information +// regarding copyright ownership. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +process FEATURES_COMBINE_JSON { + + tag "${meta.id}" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "ensemblorg/ensembl-genomio:v1.6.1" + + input: + tuple val(meta), path(json_manifest), path(agp) + + output: + tuple val(meta), path("${meta.id}.features.json"), emit: combined_json + + script: + def args = [] + + if (params.chunk_id_regex) { + def rx = params.chunk_id_regex.replace("'", "'\"'\"'") + args << "--chunk-id-regex '${rx}'" + } + + if (params.allow_revcomp) { + args << "--allow-revcomp" + } + + def has_agp = agp && agp.baseName != 'NO_FILE' + if (has_agp) { + args << "--agp-file '${agp}'" + } + + def out_json = "${meta.id}.features.json" + + """ + python -m ensembl.io.genomio.features.combine_json \\ + --json-manifest '${json_manifest}' \\ + --out-json '${out_json}' \\ + ${args.join(' ')} + """ + + stub: + """ + set -euo pipefail + + test_data_dir="${moduleDir}/tests/data" + + out_json="${meta.id}.features.json" + + test -s "${json_manifest}" + + mode="header" + agp_path="${agp}" + agp_name="\${agp_path##*/}" + if [[ "\$agp_name" != "NO_FILE" ]]; then + mode="agp" + fi + + manifest_real="\$(python -c 'from pathlib import Path; import sys; print(Path(sys.argv[1]).resolve())' "${json_manifest}")" + manifest_dir="\$(dirname "\$manifest_real")" + + first_json="\$(head -n 1 "${json_manifest}")" + if [[ -z "\$first_json" ]]; then + echo "ERROR: manifest is empty: ${json_manifest}" >&2 + exit 1 + fi + if [[ "\$first_json" != /* ]]; then + first_json="\${manifest_dir}/\${first_json}" + fi + if [[ ! -s "\$first_json" ]]; then + echo "ERROR: first JSON in manifest does not exist or is empty: \$first_json" >&2 + exit 1 + fi + + + if grep -q '"ncrna_features"' "\$first_json"; then + load_type="ncrna" + elif grep -q '"repeat_features"' "\$first_json"; then + load_type="repeat" + else + echo "ERROR: cannot detect load type from first JSON: \$first_json" >&2 + echo "Expected top-level key: 'repeat_features' or 'ncrna_features'." 
>&2 + exit 1 + fi + + # Provide a schema-valid combined JSON fixture. + # Fixtures are arranged under: + # tests/data/repeat/header/output/.features.json + # tests/data/repeat/agp/output/.features.json + # tests/data/ncrna/header/output/.features.json + # tests/data/ncrna/agp/output/.features.json + fixture="\$test_data_dir/\$load_type/\$mode/output/${meta.id}.features.json" + + if [[ ! -s "\$fixture" ]]; then + echo "ERROR: missing stub fixture: \$fixture" >&2 + echo "Make sure you created output fixture for meta.id='${meta.id}' under \$load_type/\$mode/output/." >&2 + exit 1 + fi + + cp "\$fixture" "\$out_json" + """ +} diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/agp/.DS_Store b/modules/ensembl/features/combine_json/tests/data/ncrna/agp/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..46ebb6833f86b8f68ba1c38bb8339c81f2e59428 GIT binary patch literal 6148 zcmeHKQBK1!40Xzciul9hFNr(5>V37D*$i=vk3a~F9-B&0EUc75f+HkP@smoe8q4Y4!h5OA!Ab1aB{LT=E*C&d_!@v zI_zHD$puAgjR9j|$iRg@j=BEdpT7SOgY3x|Fb2+w0XJ%9%@nVcYisdxTx&h_0m{OD lNpT;7j+A2faw)!nioou<0}L6HA}kR55fB=zF$VsWfp4#8P9XpQ literal 0 HcmV?d00001 diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/agp/inputs/in.json b/modules/ensembl/features/combine_json/tests/data/ncrna/agp/inputs/in.json new file mode 100644 index 0000000..d1701a4 --- /dev/null +++ b/modules/ensembl/features/combine_json/tests/data/ncrna/agp/inputs/in.json @@ -0,0 +1,27 @@ +{ + "analysis": { + "run_date": "2026-02-18T00:00:00Z", + "logic_name": "cmscan", + "display_label": "cmscan", + "description": "cmscan analysis", + "program": "test", + "program_version": "0.0" + }, + "source": { + "source_provider": "prov", + "is_primary": true + }, + "ncrna_tool": "cmscan", + "ncrna_features": [ + { + "seq_region": "comp1", + "seq_region_start": 10, + "seq_region_end": 20, + "seq_region_strand": 1, + "biotype": "miRNA", + "score": 1.0, + "target_name": "MIRTEST", + "is_significant": true + } + ] +} 
diff --git a/modules/ensembl/repeats/combine_json/tests/data/agp/manifest.txt b/modules/ensembl/features/combine_json/tests/data/ncrna/agp/manifest.txt similarity index 100% rename from modules/ensembl/repeats/combine_json/tests/data/agp/manifest.txt rename to modules/ensembl/features/combine_json/tests/data/ncrna/agp/manifest.txt diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/agp/output/test.features.json b/modules/ensembl/features/combine_json/tests/data/ncrna/agp/output/test.features.json new file mode 100644 index 0000000..3479d90 --- /dev/null +++ b/modules/ensembl/features/combine_json/tests/data/ncrna/agp/output/test.features.json @@ -0,0 +1,27 @@ +{ + "analysis": { + "run_date": "2026-02-18T00:00:00Z", + "logic_name": "cmscan", + "display_label": "cmscan", + "description": "cmscan analysis", + "program": "test", + "program_version": "0.0" + }, + "source": { + "source_provider": "prov", + "is_primary": true + }, + "ncrna_tool": "cmscan", + "ncrna_features": [ + { + "seq_region": "chr1", + "seq_region_start": 109, + "seq_region_end": 119, + "seq_region_strand": 1, + "biotype": "miRNA", + "score": 1.0, + "target_name": "MIRTEST", + "is_significant": true + } + ] +} diff --git a/modules/ensembl/repeats/combine_json/tests/data/agp/test.agp b/modules/ensembl/features/combine_json/tests/data/ncrna/agp/test.agp similarity index 100% rename from modules/ensembl/repeats/combine_json/tests/data/agp/test.agp rename to modules/ensembl/features/combine_json/tests/data/ncrna/agp/test.agp diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/.DS_Store b/modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..fe6726456fbd1a5c01fb943de0bb1315c8847206 GIT binary patch literal 6148 zcmeHKJ5Iwu5S>XZWRxZ)<%)tnH!zVoK`wwoB!U!UOCcSFkH9Io0Rb*U#&r&+1p7X5D>H5~e^NwdSCi z^xvcL)!}$IkI6?TkIo!b{hh}#pMD>>_-f?aM+Xk6^^Z;g%j(YF^}Jw 
z{e;54gX5&`#D!tB!9Xz3XJBlXQ>p(q$It)$ptuSKf`OA_fU|N@&hbcITL%v(wKhR- pp(5hf8ty~T(NYXwDa99171)z>fQ4gg2n!@O0!l+P!N8v~@C8=rP3-^x literal 0 HcmV?d00001 diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/inputs/a.json b/modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/inputs/a.json new file mode 100644 index 0000000..82f7bb1 --- /dev/null +++ b/modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/inputs/a.json @@ -0,0 +1,27 @@ +{ + "analysis": { + "run_date": "2026-02-18T00:00:00Z", + "logic_name": "cmscan", + "display_label": "cmscan", + "description": "cmscan analysis", + "program": "test", + "program_version": "0.0" + }, + "source": { + "source_provider": "prov", + "is_primary": true + }, + "ncrna_tool": "cmscan", + "ncrna_features": [ + { + "seq_region": "chr1_1", + "seq_region_start": 1, + "seq_region_end": 3, + "seq_region_strand": 1, + "biotype": "miRNA", + "score": 1.0, + "target_name": "MIRTEST", + "is_significant": true + } + ] +} diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/inputs/b.json b/modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/inputs/b.json new file mode 100644 index 0000000..d6c2349 --- /dev/null +++ b/modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/inputs/b.json @@ -0,0 +1,27 @@ +{ + "analysis": { + "run_date": "2026-02-18T00:00:00Z", + "logic_name": "cmscan", + "display_label": "cmscan", + "description": "cmscan analysis", + "program": "test", + "program_version": "0.0" + }, + "source": { + "source_provider": "prov", + "is_primary": true + }, + "ncrna_tool": "cmscan", + "ncrna_features": [ + { + "seq_region": "chr1_4", + "seq_region_start": 1, + "seq_region_end": 2, + "seq_region_strand": 1, + "biotype": "miRNA", + "score": 1.0, + "target_name": "MIRTEST", + "is_significant": true + } + ] +} diff --git a/modules/ensembl/repeats/combine_json/tests/data/header/manifest.txt 
b/modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/manifest.txt similarity index 100% rename from modules/ensembl/repeats/combine_json/tests/data/header/manifest.txt rename to modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/manifest.txt diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/output/test.features.json b/modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/output/test.features.json new file mode 100644 index 0000000..995f408 --- /dev/null +++ b/modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/output/test.features.json @@ -0,0 +1,37 @@ +{ + "analysis": { + "run_date": "2026-02-18T00:00:00Z", + "logic_name": "cmscan", + "display_label": "cmscan", + "description": "cmscan analysis", + "program": "test", + "program_version": "0.0" + }, + "source": { + "source_provider": "prov", + "is_primary": true + }, + "ncrna_tool": "cmscan", + "ncrna_features": [ + { + "seq_region": "chr1", + "seq_region_start": 1, + "seq_region_end": 3, + "seq_region_strand": 1, + "biotype": "miRNA", + "score": 1.0, + "target_name": "MIRTEST", + "is_significant": true + }, + { + "seq_region": "chr1", + "seq_region_start": 4, + "seq_region_end": 5, + "seq_region_strand": 1, + "biotype": "miRNA", + "score": 1.0, + "target_name": "MIRTEST", + "is_significant": true + } + ] +} diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/header/.DS_Store b/modules/ensembl/features/combine_json/tests/data/ncrna/header/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..34af3ace395b1fabfb7621ad8be3aaa77d8fada9 GIT binary patch literal 6148 zcmeHKOHRW;47J+`1$EOU%UQAS4MG)8&pN5D_nT)q-eFL^)J&a*pN)k#W(HOxz-ataCikus}!J)b>)yl&nzy%=-9KlVX6ga#b$zNM2h<4=1%YL2sZU n;@27OL(tJu3|}e57f==0lXQTEV`m5pBsKy{Lo~s_pEB?TBx_B{ literal 0 HcmV?d00001 diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/header/inputs/a.json 
b/modules/ensembl/features/combine_json/tests/data/ncrna/header/inputs/a.json new file mode 100644 index 0000000..8cee59d --- /dev/null +++ b/modules/ensembl/features/combine_json/tests/data/ncrna/header/inputs/a.json @@ -0,0 +1,27 @@ +{ + "analysis": { + "run_date": "2026-02-18T00:00:00Z", + "logic_name": "cmscan", + "display_label": "cmscan", + "description": "cmscan analysis", + "program": "test", + "program_version": "0.0" + }, + "source": { + "source_provider": "prov", + "is_primary": true + }, + "ncrna_tool": "cmscan", + "ncrna_features": [ + { + "seq_region": "chr1_chunk_start_1", + "seq_region_start": 1, + "seq_region_end": 3, + "seq_region_strand": 1, + "biotype": "miRNA", + "score": 1.0, + "target_name": "MIRTEST", + "is_significant": true + } + ] +} diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/header/inputs/b.json b/modules/ensembl/features/combine_json/tests/data/ncrna/header/inputs/b.json new file mode 100644 index 0000000..a76e76e --- /dev/null +++ b/modules/ensembl/features/combine_json/tests/data/ncrna/header/inputs/b.json @@ -0,0 +1,27 @@ +{ + "analysis": { + "run_date": "2026-02-18T00:00:00Z", + "logic_name": "cmscan", + "display_label": "cmscan", + "description": "cmscan analysis", + "program": "test", + "program_version": "0.0" + }, + "source": { + "source_provider": "prov", + "is_primary": true + }, + "ncrna_tool": "cmscan", + "ncrna_features": [ + { + "seq_region": "chr1_chunk_start_4", + "seq_region_start": 1, + "seq_region_end": 2, + "seq_region_strand": 1, + "biotype": "miRNA", + "score": 1.0, + "target_name": "MIRTEST", + "is_significant": true + } + ] +} diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/header/manifest.txt b/modules/ensembl/features/combine_json/tests/data/ncrna/header/manifest.txt new file mode 100644 index 0000000..cada44b --- /dev/null +++ b/modules/ensembl/features/combine_json/tests/data/ncrna/header/manifest.txt @@ -0,0 +1,2 @@ +inputs/a.json +inputs/b.json \ No newline at 
end of file diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/header/output/test.features.json b/modules/ensembl/features/combine_json/tests/data/ncrna/header/output/test.features.json new file mode 100644 index 0000000..995f408 --- /dev/null +++ b/modules/ensembl/features/combine_json/tests/data/ncrna/header/output/test.features.json @@ -0,0 +1,37 @@ +{ + "analysis": { + "run_date": "2026-02-18T00:00:00Z", + "logic_name": "cmscan", + "display_label": "cmscan", + "description": "cmscan analysis", + "program": "test", + "program_version": "0.0" + }, + "source": { + "source_provider": "prov", + "is_primary": true + }, + "ncrna_tool": "cmscan", + "ncrna_features": [ + { + "seq_region": "chr1", + "seq_region_start": 1, + "seq_region_end": 3, + "seq_region_strand": 1, + "biotype": "miRNA", + "score": 1.0, + "target_name": "MIRTEST", + "is_significant": true + }, + { + "seq_region": "chr1", + "seq_region_start": 4, + "seq_region_end": 5, + "seq_region_strand": 1, + "biotype": "miRNA", + "score": 1.0, + "target_name": "MIRTEST", + "is_significant": true + } + ] +} diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/order/.DS_Store b/modules/ensembl/features/combine_json/tests/data/ncrna/order/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..66ff2cf9ee6c96bb1913fed247a844fe61ae8b34 GIT binary patch literal 6148 zcmeHKJ5Iwu5S>XZWJHsaax41Wz(nQ*xc~~02vUMALHez!C^!Q*;0&}>oQF3b1v@69 zLnxY&X5V^tW<7pudv=M4XWQkJXhK9KRB&{P<_D2+(SeNIB7>}RJkrDZb~?XlHtoPW z4F8bwo^f4n`fUf(Z=KFeQy+Ew>GNtgn8G@65E z(tnS}7yIMgJf1!~d35I3G~am~^6BRx-$6()5DWwZN6!Fiwn%wq7;P{R37S8a)C43z${V*F6+4i2+zSW_ + params.chunk_id_regex = '^(?P.+)_(?P\\d+)$' + + process { + """ + input[0] = [[ id:'test' ], + file('${moduleDir}/tests/data/ncrna/custom_regex/manifest.txt'), + file('${projectDir}/modules/assets/NO_FILE')] + """ + } + } + + then { + assert process.trace.tasks().size() == 1 + assert process.out.combined_json.size() == 1 + assert process.success + 
assert snapshot(process.out).match() + } + } + + test("Real run: repeat manifest order is preserved") { + + when { + process { + """ + input[0] = [[ id:'test' ], + file('${moduleDir}/tests/data/repeat/order/manifest.txt'), + file('${projectDir}/modules/assets/NO_FILE')] + """ + } + } + + then { + assert process.trace.tasks().size() == 1 + assert process.out.combined_json.size() == 1 + assert process.success + assert snapshot(process.out).match() + } + } + + test("Real run: ncRNA manifest order is preserved") { + + when { + process { + """ + input[0] = [[ id:'test' ], + file('${moduleDir}/tests/data/ncrna/order/manifest.txt'), + file('${projectDir}/modules/assets/NO_FILE')] + """ + } + } + + then { + assert process.trace.tasks().size() == 1 + assert process.out.combined_json.size() == 1 + assert process.success + assert snapshot(process.out).match() + } + } +} diff --git a/modules/ensembl/features/combine_json/tests/main.nf.test.snap b/modules/ensembl/features/combine_json/tests/main.nf.test.snap new file mode 100644 index 0000000..e61eea6 --- /dev/null +++ b/modules/ensembl/features/combine_json/tests/main.nf.test.snap @@ -0,0 +1,488 @@ +{ + "Real run: AGP-driven liftover": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.features.json:md5,5fc5a0cd8050982334ada4bca1a55950" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.features.json:md5,5fc5a0cd8050982334ada4bca1a55950" + ] + ] + } + ], + "timestamp": "2026-02-23T17:54:02.625791", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, + "Real run: ncRNA custom chunk regex": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.features.json:md5,4c10f64659bc581612383e3afece97fb" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.features.json:md5,4c10f64659bc581612383e3afece97fb" + ] + ] + } + ], + "timestamp": "2026-02-23T19:15:58.553743", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, + "Real run: header combine + 
header-driven liftover": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.features.json:md5,007a5710a0037aae8f907d13cde08f77" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.features.json:md5,007a5710a0037aae8f907d13cde08f77" + ] + ] + } + ], + "timestamp": "2026-02-23T17:54:00.401674", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, + "Real run: repeat custom chunk regex": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.features.json:md5,f410544c71be74f7a8a7eab5e494b258" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.features.json:md5,f410544c71be74f7a8a7eab5e494b258" + ] + ] + } + ], + "timestamp": "2026-02-23T18:38:58.589502", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, + "Real run: repeat manifest order is preserved": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.features.json:md5,1b68c1371265dad11839769a5e776b33" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.features.json:md5,1b68c1371265dad11839769a5e776b33" + ] + ] + } + ], + "timestamp": "2026-02-23T18:39:03.129965", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, + "Stub outputs: ncRNA header mode": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.features.json:md5,556a240063931bcbba8ee21d6efc373d" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.features.json:md5,556a240063931bcbba8ee21d6efc373d" + ] + ] + } + ], + "timestamp": "2026-02-23T19:21:33.771238", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, + "Real run: ncRNA header combine + header-driven liftover": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.features.json:md5,4c10f64659bc581612383e3afece97fb" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.features.json:md5,4c10f64659bc581612383e3afece97fb" + ] + ] + } + ], + "timestamp": "2026-02-23T19:15:49.744214", + "meta": { + "nf-test": 
"0.9.4", + "nextflow": "25.10.3" + } + }, + "Stub outputs: ncRNA AGP mode": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.features.json:md5,435c4d8f4008e57685ff951bbe81df0e" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.features.json:md5,435c4d8f4008e57685ff951bbe81df0e" + ] + ] + } + ], + "timestamp": "2026-02-23T19:21:38.112104", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, + "Real run: ncRNA manifest order is preserved": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.features.json:md5,c5b36cf499f0d111684f91372469154f" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.features.json:md5,c5b36cf499f0d111684f91372469154f" + ] + ] + } + ], + "timestamp": "2026-02-23T19:16:02.962026", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, + "Stub outputs: repeat AGP mode": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.features.json:md5,5fc5a0cd8050982334ada4bca1a55950" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.features.json:md5,5fc5a0cd8050982334ada4bca1a55950" + ] + ] + } + ], + "timestamp": "2026-02-23T19:21:35.954494", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, + "Stub outputs: AGP mode": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.features.json:md5,5fc5a0cd8050982334ada4bca1a55950" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.features.json:md5,5fc5a0cd8050982334ada4bca1a55950" + ] + ] + } + ], + "timestamp": "2026-02-23T17:53:58.199351", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, + "Stub outputs: header mode": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.features.json:md5,007a5710a0037aae8f907d13cde08f77" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.features.json:md5,007a5710a0037aae8f907d13cde08f77" + ] + ] + } + ], + "timestamp": "2026-02-23T17:53:56.112251", + "meta": { + 
"nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, + "Real run: ncRNA AGP-driven liftover": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.features.json:md5,837dcba57ebd00c1b8adbce528b8f1b0" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.features.json:md5,837dcba57ebd00c1b8adbce528b8f1b0" + ] + ] + } + ], + "timestamp": "2026-02-23T19:15:54.146861", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, + "Real run: repeat header combine + header-driven liftover": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.features.json:md5,007a5710a0037aae8f907d13cde08f77" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.features.json:md5,007a5710a0037aae8f907d13cde08f77" + ] + ] + } + ], + "timestamp": "2026-02-23T18:38:49.606314", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, + "Real run: custom chunk regex": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.features.json:md5,f410544c71be74f7a8a7eab5e494b258" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.features.json:md5,f410544c71be74f7a8a7eab5e494b258" + ] + ] + } + ], + "timestamp": "2026-02-23T17:54:04.861554", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, + "Stub outputs: repeat header mode": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.features.json:md5,007a5710a0037aae8f907d13cde08f77" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.features.json:md5,007a5710a0037aae8f907d13cde08f77" + ] + ] + } + ], + "timestamp": "2026-02-23T19:21:31.584701", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, + "Real run: manifest order is preserved": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.features.json:md5,1b68c1371265dad11839769a5e776b33" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.features.json:md5,1b68c1371265dad11839769a5e776b33" + ] + ] + } + ], + "timestamp": 
"2026-02-23T17:54:07.074875", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, + "Real run: repeat AGP-driven liftover": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.features.json:md5,5fc5a0cd8050982334ada4bca1a55950" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.features.json:md5,5fc5a0cd8050982334ada4bca1a55950" + ] + ] + } + ], + "timestamp": "2026-02-23T18:38:54.140158", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + } +} \ No newline at end of file diff --git a/modules/ensembl/repeats/combine_json/main.nf b/modules/ensembl/repeats/combine_json/main.nf deleted file mode 100644 index ff2177b..0000000 --- a/modules/ensembl/repeats/combine_json/main.nf +++ /dev/null @@ -1,79 +0,0 @@ -// See the NOTICE file distributed with this work for additional information -// regarding copyright ownership. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -process REPEATS_COMBINE_JSON { - - tag "${meta.id}" - label 'process_medium' - - conda "${moduleDir}/environment.yml" - container "ensemblorg/ensembl-genomio:v1.6.1" - - input: - tuple val(meta), path(json_manifest), path(agp) - - output: - tuple val(meta), path("${meta.id}.repeat.json"), emit: combined_json - - script: - def args = [] - - if (params.chunk_id_regex) { - def rx = params.chunk_id_regex.replace("'", "'\"'\"'") - args << "--chunk-id-regex '${rx}'" - } - - if (params.allow_revcomp) { - args << "--allow-revcomp" - } - - def has_agp = agp && agp.baseName != 'NO_FILE' - if (has_agp) { - args << "--agp-file ${agp}" - } - - def out_json = "${meta.id}.repeat.json" - - """ - python -m ensembl.io.genomio.repeats.combine_json \\ - --json-manifest ${json_manifest} \\ - --out-json ${out_json} \\ - ${args.join(' ')} - """ - - stub: - """ - set -euo pipefail - - test_data_dir="${moduleDir}/tests/data" - - out_json="${meta.id}.repeat.json" - - test -s "${json_manifest}" - - mode="header" - agp_path="${agp}" - agp_name="\${agp_path##*/}" - if [[ "\$agp_name" != "NO_FILE" ]]; then - mode="agp" - fi - - # Provide a schema-valid combined JSON fixture. - # Arrange fixtures under: - # tests/data/header/output/.repeat.json - # tests/data/agp/output/.repeat.json - cp "\$test_data_dir/\$mode/output/${meta.id}.repeat.json" "\$out_json" - """ -} diff --git a/modules/ensembl/repeats/combine_json/tests/main.nf.test b/modules/ensembl/repeats/combine_json/tests/main.nf.test deleted file mode 100644 index 5a6eff4..0000000 --- a/modules/ensembl/repeats/combine_json/tests/main.nf.test +++ /dev/null @@ -1,153 +0,0 @@ -// See the NOTICE file distributed with this work for additional information -// regarding copyright ownership. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// nf-core modules test repeats/combine_json -nextflow_process { - - name "Test Process REPEATS_COMBINE_JSON" - script "../main.nf" - process "REPEATS_COMBINE_JSON" - - tag "modules" - tag "modules_ensembl" - tag "repeats" - tag "repeats/combine_json" - - test("Stub outputs: header mode") { - - when { - options "-stub" - - process { - """ - input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/header/manifest.txt'), - file('${projectDir}/modules/assets/NO_FILE')] - """ - } - } - - then { - assert process.trace.tasks().size() == 1 - assert process.out.combined_json.size() == 1 - assert process.success - assert snapshot(process.out).match() - } - } - - test("Stub outputs: AGP mode") { - - when { - options "-stub" - - process { - """ - input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/agp/manifest.txt'), - file('${moduleDir}/tests/data/agp/test.agp')] - """ - } - } - - then { - assert process.trace.tasks().size() == 1 - assert process.out.combined_json.size() == 1 - assert process.success - assert snapshot(process.out).match() - } - } - - test("Real run: header combine + header-driven liftover") { - - when { - process { - """ - input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/header/manifest.txt'), - file('${projectDir}/modules/assets/NO_FILE')] - """ - } - } - - then { - assert process.trace.tasks().size() == 1 - assert process.out.combined_json.size() == 1 - assert process.success - assert snapshot(process.out).match() - } - } - - test("Real run: AGP-driven liftover") { - - when { - process { - """ - input[0] = [[ 
id:'test' ], - file('${moduleDir}/tests/data/agp/manifest.txt'), - file('${moduleDir}/tests/data/agp/test.agp')] - """ - } - } - - then { - assert process.trace.tasks().size() == 1 - assert process.out.combined_json.size() == 1 - assert process.success - assert snapshot(process.out).match() - } - } - - test("Real run: custom chunk regex") { - - when { - params.chunk_id_regex = '^(?P.+)_(?P\\d+)$' - - process { - """ - input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/custom_regex/manifest.txt'), - file('${projectDir}/modules/assets/NO_FILE')] - """ - } - } - - then { - assert process.trace.tasks().size() == 1 - assert process.out.combined_json.size() == 1 - assert process.success - assert snapshot(process.out).match() - } - } - - test("Real run: manifest order is preserved") { - - when { - process { - """ - input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/order/manifest.txt'), - file('${projectDir}/modules/assets/NO_FILE')] - """ - } - } - - then { - assert process.trace.tasks().size() == 1 - assert process.out.combined_json.size() == 1 - assert process.success - assert snapshot(process.out).match() - } - } -} diff --git a/modules/ensembl/repeats/combine_json/tests/main.nf.test.snap b/modules/ensembl/repeats/combine_json/tests/main.nf.test.snap deleted file mode 100644 index b3fae3f..0000000 --- a/modules/ensembl/repeats/combine_json/tests/main.nf.test.snap +++ /dev/null @@ -1,164 +0,0 @@ -{ - "Real run: AGP-driven liftover": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.repeat.json:md5,5fc5a0cd8050982334ada4bca1a55950" - ] - ], - "combined_json": [ - [ - { - "id": "test" - }, - "test.repeat.json:md5,5fc5a0cd8050982334ada4bca1a55950" - ] - ] - } - ], - "timestamp": "2026-02-19T00:11:13.232239", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.3" - } - }, - "Stub outputs: AGP mode": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.repeat.json:md5,5fc5a0cd8050982334ada4bca1a55950" - ] - ], - 
"combined_json": [ - [ - { - "id": "test" - }, - "test.repeat.json:md5,5fc5a0cd8050982334ada4bca1a55950" - ] - ] - } - ], - "timestamp": "2026-02-19T00:11:08.721986", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.3" - } - }, - "Real run: header combine + header-driven liftover": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.repeat.json:md5,007a5710a0037aae8f907d13cde08f77" - ] - ], - "combined_json": [ - [ - { - "id": "test" - }, - "test.repeat.json:md5,007a5710a0037aae8f907d13cde08f77" - ] - ] - } - ], - "timestamp": "2026-02-19T00:11:11.007889", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.3" - } - }, - "Stub outputs: header mode": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.repeat.json:md5,007a5710a0037aae8f907d13cde08f77" - ] - ], - "combined_json": [ - [ - { - "id": "test" - }, - "test.repeat.json:md5,007a5710a0037aae8f907d13cde08f77" - ] - ] - } - ], - "timestamp": "2026-02-19T00:11:06.662964", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.3" - } - }, - "Real run: custom chunk regex": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.repeat.json:md5,f410544c71be74f7a8a7eab5e494b258" - ] - ], - "combined_json": [ - [ - { - "id": "test" - }, - "test.repeat.json:md5,f410544c71be74f7a8a7eab5e494b258" - ] - ] - } - ], - "timestamp": "2026-02-19T00:11:15.43463", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.3" - } - }, - "Real run: manifest order is preserved": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.repeat.json:md5,1b68c1371265dad11839769a5e776b33" - ] - ], - "combined_json": [ - [ - { - "id": "test" - }, - "test.repeat.json:md5,1b68c1371265dad11839769a5e776b33" - ] - ] - } - ], - "timestamp": "2026-02-19T00:11:17.627989", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.3" - } - } -} \ No newline at end of file From e5bdeb2cd2d5183190e28979222a3263b85f7911 Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Mon, 23 Feb 2026 23:33:22 
+0000 Subject: [PATCH 19/36] Naming update --- modules/ensembl/features/combine_json/main.nf | 6 +- .../ncrna/{header => seq_region}/.DS_Store | Bin .../{header => seq_region}/inputs/a.json | 0 .../{header => seq_region}/inputs/b.json | 0 .../ncrna/{header => seq_region}/manifest.txt | 0 .../output/test.features.json | 0 .../{header => seq_region}/inputs/a.json | 0 .../{header => seq_region}/inputs/b.json | 0 .../{header => seq_region}/manifest.txt | 0 .../output/test.features.json | 0 .../features/combine_json/tests/main.nf.test | 16 +-- .../combine_json/tests/main.nf.test.snap | 108 ++++++++++++++++++ 12 files changed, 119 insertions(+), 11 deletions(-) rename modules/ensembl/features/combine_json/tests/data/ncrna/{header => seq_region}/.DS_Store (100%) rename modules/ensembl/features/combine_json/tests/data/ncrna/{header => seq_region}/inputs/a.json (100%) rename modules/ensembl/features/combine_json/tests/data/ncrna/{header => seq_region}/inputs/b.json (100%) rename modules/ensembl/features/combine_json/tests/data/ncrna/{header => seq_region}/manifest.txt (100%) rename modules/ensembl/features/combine_json/tests/data/ncrna/{header => seq_region}/output/test.features.json (100%) rename modules/ensembl/features/combine_json/tests/data/repeat/{header => seq_region}/inputs/a.json (100%) rename modules/ensembl/features/combine_json/tests/data/repeat/{header => seq_region}/inputs/b.json (100%) rename modules/ensembl/features/combine_json/tests/data/repeat/{header => seq_region}/manifest.txt (100%) rename modules/ensembl/features/combine_json/tests/data/repeat/{header => seq_region}/output/test.features.json (100%) diff --git a/modules/ensembl/features/combine_json/main.nf b/modules/ensembl/features/combine_json/main.nf index 18425e5..d895040 100644 --- a/modules/ensembl/features/combine_json/main.nf +++ b/modules/ensembl/features/combine_json/main.nf @@ -63,7 +63,7 @@ process FEATURES_COMBINE_JSON { test -s "${json_manifest}" - mode="header" + mode="seq_region" 
agp_path="${agp}" agp_name="\${agp_path##*/}" if [[ "\$agp_name" != "NO_FILE" ]]; then @@ -99,9 +99,9 @@ process FEATURES_COMBINE_JSON { # Provide a schema-valid combined JSON fixture. # Fixtures are arranged under: - # tests/data/repeat/header/output/.features.json + # tests/data/repeat/seq_region/output/.features.json # tests/data/repeat/agp/output/.features.json - # tests/data/ncrna/header/output/.features.json + # tests/data/ncrna/seq_region/output/.features.json # tests/data/ncrna/agp/output/.features.json fixture="\$test_data_dir/\$load_type/\$mode/output/${meta.id}.features.json" diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/header/.DS_Store b/modules/ensembl/features/combine_json/tests/data/ncrna/seq_region/.DS_Store similarity index 100% rename from modules/ensembl/features/combine_json/tests/data/ncrna/header/.DS_Store rename to modules/ensembl/features/combine_json/tests/data/ncrna/seq_region/.DS_Store diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/header/inputs/a.json b/modules/ensembl/features/combine_json/tests/data/ncrna/seq_region/inputs/a.json similarity index 100% rename from modules/ensembl/features/combine_json/tests/data/ncrna/header/inputs/a.json rename to modules/ensembl/features/combine_json/tests/data/ncrna/seq_region/inputs/a.json diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/header/inputs/b.json b/modules/ensembl/features/combine_json/tests/data/ncrna/seq_region/inputs/b.json similarity index 100% rename from modules/ensembl/features/combine_json/tests/data/ncrna/header/inputs/b.json rename to modules/ensembl/features/combine_json/tests/data/ncrna/seq_region/inputs/b.json diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/header/manifest.txt b/modules/ensembl/features/combine_json/tests/data/ncrna/seq_region/manifest.txt similarity index 100% rename from modules/ensembl/features/combine_json/tests/data/ncrna/header/manifest.txt rename to 
modules/ensembl/features/combine_json/tests/data/ncrna/seq_region/manifest.txt diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/header/output/test.features.json b/modules/ensembl/features/combine_json/tests/data/ncrna/seq_region/output/test.features.json similarity index 100% rename from modules/ensembl/features/combine_json/tests/data/ncrna/header/output/test.features.json rename to modules/ensembl/features/combine_json/tests/data/ncrna/seq_region/output/test.features.json diff --git a/modules/ensembl/features/combine_json/tests/data/repeat/header/inputs/a.json b/modules/ensembl/features/combine_json/tests/data/repeat/seq_region/inputs/a.json similarity index 100% rename from modules/ensembl/features/combine_json/tests/data/repeat/header/inputs/a.json rename to modules/ensembl/features/combine_json/tests/data/repeat/seq_region/inputs/a.json diff --git a/modules/ensembl/features/combine_json/tests/data/repeat/header/inputs/b.json b/modules/ensembl/features/combine_json/tests/data/repeat/seq_region/inputs/b.json similarity index 100% rename from modules/ensembl/features/combine_json/tests/data/repeat/header/inputs/b.json rename to modules/ensembl/features/combine_json/tests/data/repeat/seq_region/inputs/b.json diff --git a/modules/ensembl/features/combine_json/tests/data/repeat/header/manifest.txt b/modules/ensembl/features/combine_json/tests/data/repeat/seq_region/manifest.txt similarity index 100% rename from modules/ensembl/features/combine_json/tests/data/repeat/header/manifest.txt rename to modules/ensembl/features/combine_json/tests/data/repeat/seq_region/manifest.txt diff --git a/modules/ensembl/features/combine_json/tests/data/repeat/header/output/test.features.json b/modules/ensembl/features/combine_json/tests/data/repeat/seq_region/output/test.features.json similarity index 100% rename from modules/ensembl/features/combine_json/tests/data/repeat/header/output/test.features.json rename to 
modules/ensembl/features/combine_json/tests/data/repeat/seq_region/output/test.features.json diff --git a/modules/ensembl/features/combine_json/tests/main.nf.test b/modules/ensembl/features/combine_json/tests/main.nf.test index a7490d8..360aa79 100644 --- a/modules/ensembl/features/combine_json/tests/main.nf.test +++ b/modules/ensembl/features/combine_json/tests/main.nf.test @@ -25,7 +25,7 @@ nextflow_process { tag "features" tag "features/combine_json" - test("Stub outputs: repeat header mode") { + test("Stub outputs: repeat seq_region mode") { when { options "-stub" @@ -33,7 +33,7 @@ nextflow_process { process { """ input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/repeat/header/manifest.txt'), + file('${moduleDir}/tests/data/repeat/seq_region/manifest.txt'), file('${projectDir}/modules/assets/NO_FILE')] """ } @@ -47,7 +47,7 @@ nextflow_process { } } - test("Stub outputs: ncRNA header mode") { + test("Stub outputs: ncRNA seq_region mode") { when { options "-stub" @@ -55,7 +55,7 @@ nextflow_process { process { """ input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/ncrna/header/manifest.txt'), + file('${moduleDir}/tests/data/ncrna/seq_region/manifest.txt'), file('${projectDir}/modules/assets/NO_FILE')] """ } @@ -113,13 +113,13 @@ nextflow_process { } } - test("Real run: repeat header combine + header-driven liftover") { + test("Real run: repeat seq_region combine + seq_region-driven liftover") { when { process { """ input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/repeat/header/manifest.txt'), + file('${moduleDir}/tests/data/repeat/seq_region/manifest.txt'), file('${projectDir}/modules/assets/NO_FILE')] """ } @@ -133,13 +133,13 @@ nextflow_process { } } - test("Real run: ncRNA header combine + header-driven liftover") { + test("Real run: ncRNA seq_region combine + seq_region-driven liftover") { when { process { """ input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/ncrna/header/manifest.txt'), + 
file('${moduleDir}/tests/data/ncrna/seq_region/manifest.txt'), file('${projectDir}/modules/assets/NO_FILE')] """ } diff --git a/modules/ensembl/features/combine_json/tests/main.nf.test.snap b/modules/ensembl/features/combine_json/tests/main.nf.test.snap index e61eea6..9ff7391 100644 --- a/modules/ensembl/features/combine_json/tests/main.nf.test.snap +++ b/modules/ensembl/features/combine_json/tests/main.nf.test.snap @@ -161,6 +161,60 @@ "nextflow": "25.10.3" } }, + "Real run: repeat seq_region combine + seq_region-driven liftover": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.features.json:md5,007a5710a0037aae8f907d13cde08f77" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.features.json:md5,007a5710a0037aae8f907d13cde08f77" + ] + ] + } + ], + "timestamp": "2026-02-23T23:31:17.929825", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, + "Stub outputs: ncRNA seq_region mode": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.features.json:md5,556a240063931bcbba8ee21d6efc373d" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.features.json:md5,556a240063931bcbba8ee21d6efc373d" + ] + ] + } + ], + "timestamp": "2026-02-23T23:32:28.865106", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, "Real run: ncRNA header combine + header-driven liftover": { "content": [ { @@ -188,6 +242,33 @@ "nextflow": "25.10.3" } }, + "Real run: ncRNA seq_region combine + seq_region-driven liftover": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.features.json:md5,4c10f64659bc581612383e3afece97fb" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.features.json:md5,4c10f64659bc581612383e3afece97fb" + ] + ] + } + ], + "timestamp": "2026-02-23T23:31:20.204864", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, "Stub outputs: ncRNA AGP mode": { "content": [ { @@ -458,6 +539,33 @@ "nextflow": "25.10.3" } }, + "Stub outputs: repeat seq_region 
mode": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.features.json:md5,007a5710a0037aae8f907d13cde08f77" + ] + ], + "combined_json": [ + [ + { + "id": "test" + }, + "test.features.json:md5,007a5710a0037aae8f907d13cde08f77" + ] + ] + } + ], + "timestamp": "2026-02-23T23:32:26.754167", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.3" + } + }, "Real run: repeat AGP-driven liftover": { "content": [ { From fda3137afa5f3992c23db47656e07ccf056e66ec Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Mon, 2 Mar 2026 23:23:36 +0000 Subject: [PATCH 20/36] Add version.yml to output --- modules/ensembl/fasta/recombine/main.nf | 10 ++++++++++ .../ensembl/fasta/recombine/tests/main.nf.test | 8 ++++---- modules/ensembl/fasta/split/main.nf | 11 +++++++++++ modules/ensembl/features/combine_json/main.nf | 13 ++++++++++++- .../features/combine_json/tests/main.nf.test | 16 ++++++++-------- 5 files changed, 45 insertions(+), 13 deletions(-) diff --git a/modules/ensembl/fasta/recombine/main.nf b/modules/ensembl/fasta/recombine/main.nf index 9a7fd97..01b53dc 100644 --- a/modules/ensembl/fasta/recombine/main.nf +++ b/modules/ensembl/fasta/recombine/main.nf @@ -26,6 +26,7 @@ process FASTA_RECOMBINE { output: tuple val(meta), path("${meta.id}.fa"), emit: recombined_fasta + path "versions.yml", emit: versions script: def args = [] @@ -51,6 +52,11 @@ process FASTA_RECOMBINE { --fasta-manifest ${fasta_manifest} \\ --out-fasta ${out_fasta} \\ ${args.join(' ')} + + cat <<-END_VERSIONS > versions.yml + ${task.process}: + fasta_recombine: $(fasta_recombine --version 2>/dev/null | head -n 1) + END_VERSIONS """ stub: @@ -73,6 +79,10 @@ process FASTA_RECOMBINE { cp "\$test_data_dir/\$mode/output/${meta.id}.fa" "\$out_fasta" + cat <<-END_VERSIONS > versions.yml + ${task.process}: + fasta_recombine: $(fasta_recombine --version 2>/dev/null | head -n 1) + END_VERSIONS """ } diff --git a/modules/ensembl/fasta/recombine/tests/main.nf.test 
b/modules/ensembl/fasta/recombine/tests/main.nf.test index ef81bd9..91844ba 100644 --- a/modules/ensembl/fasta/recombine/tests/main.nf.test +++ b/modules/ensembl/fasta/recombine/tests/main.nf.test @@ -35,7 +35,7 @@ nextflow_process { """ input[0] = [[ id:'test' ], file('${moduleDir}/tests/data/header/manifest.txt'), - file('${projectDir}/modules/assets/NO_FILE')] + file('${workflow.projectDir}/modules/assets/NO_FILE')] """ } @@ -80,7 +80,7 @@ nextflow_process { """ input[0] = [[ id:'test' ], file('${moduleDir}/tests/data/header/manifest.txt'), - file('${projectDir}/modules/assets/NO_FILE')] + file('${workflow.projectDir}/modules/assets/NO_FILE')] """ } } @@ -124,7 +124,7 @@ nextflow_process { """ input[0] = [[ id:'test' ], file('${moduleDir}/tests/data/custom_regex/manifest.txt'), - file('${projectDir}/modules/assets/NO_FILE')] + file('${workflow.projectDir}/modules/assets/NO_FILE')] """ } } @@ -145,7 +145,7 @@ nextflow_process { """ input[0] = [[ id:'test' ], file('${moduleDir}/tests/data/order/manifest.txt'), - file('${projectDir}/modules/assets/NO_FILE')] + file('${workflow.projectDir}/modules/assets/NO_FILE')] """ } } diff --git a/modules/ensembl/fasta/split/main.nf b/modules/ensembl/fasta/split/main.nf index 845628b..8b5a845 100644 --- a/modules/ensembl/fasta/split/main.nf +++ b/modules/ensembl/fasta/split/main.nf @@ -27,6 +27,7 @@ process FASTA_SPLIT { output: tuple val(meta), path("splits/**/*.fa"), emit: fastas tuple val(meta), path("splits/*.agp"), emit: agp, optional: true + path "versions.yml", emit: versions script: def args = [] @@ -72,6 +73,11 @@ process FASTA_SPLIT { --fasta-file ${fasta} \\ --out-dir splits \\ ${args.join(' ')} + + cat <<-END_VERSIONS > versions.yml + ${task.process}: + fasta_split: $(fasta_split --version 2>/dev/null | head -n 1) + END_VERSIONS """ stub: @@ -93,6 +99,11 @@ process FASTA_SPLIT { if [[ "${params.write_agp ?: false}" == "true" ]]; then cp "\$test_data_dir/agp/test.agp" "splits/${meta.id}.agp" fi + + cat 
<<-END_VERSIONS > versions.yml + ${task.process}: + fasta_split: $(fasta_split --version 2>/dev/null | head -n 1) + END_VERSIONS """ diff --git a/modules/ensembl/features/combine_json/main.nf b/modules/ensembl/features/combine_json/main.nf index d895040..6d92340 100644 --- a/modules/ensembl/features/combine_json/main.nf +++ b/modules/ensembl/features/combine_json/main.nf @@ -26,6 +26,7 @@ process FEATURES_COMBINE_JSON { output: tuple val(meta), path("${meta.id}.features.json"), emit: combined_json + path "versions.yml", emit: versions script: def args = [] @@ -47,10 +48,15 @@ process FEATURES_COMBINE_JSON { def out_json = "${meta.id}.features.json" """ - python -m ensembl.io.genomio.features.combine_json \\ + features_combine_json \\ --json-manifest '${json_manifest}' \\ --out-json '${out_json}' \\ ${args.join(' ')} + + cat <<-END_VERSIONS > versions.yml + ${task.process}: + features_combine_json: $(features_combine_json --version 2>/dev/null | head -n 1) + END_VERSIONS """ stub: @@ -112,5 +118,10 @@ process FEATURES_COMBINE_JSON { fi cp "\$fixture" "\$out_json" + + cat <<-END_VERSIONS > versions.yml + ${task.process}: + features_combine_json: $(features_combine_json --version 2>/dev/null | head -n 1) + END_VERSIONS """ } diff --git a/modules/ensembl/features/combine_json/tests/main.nf.test b/modules/ensembl/features/combine_json/tests/main.nf.test index 360aa79..153f75b 100644 --- a/modules/ensembl/features/combine_json/tests/main.nf.test +++ b/modules/ensembl/features/combine_json/tests/main.nf.test @@ -34,7 +34,7 @@ nextflow_process { """ input[0] = [[ id:'test' ], file('${moduleDir}/tests/data/repeat/seq_region/manifest.txt'), - file('${projectDir}/modules/assets/NO_FILE')] + file('${workflow.projectDir}/modules/assets/NO_FILE')] """ } } @@ -56,7 +56,7 @@ nextflow_process { """ input[0] = [[ id:'test' ], file('${moduleDir}/tests/data/ncrna/seq_region/manifest.txt'), - file('${projectDir}/modules/assets/NO_FILE')] + 
file('${workflow.projectDir}/modules/assets/NO_FILE')] """ } } @@ -120,7 +120,7 @@ nextflow_process { """ input[0] = [[ id:'test' ], file('${moduleDir}/tests/data/repeat/seq_region/manifest.txt'), - file('${projectDir}/modules/assets/NO_FILE')] + file('${workflow.projectDir}/modules/assets/NO_FILE')] """ } } @@ -140,7 +140,7 @@ nextflow_process { """ input[0] = [[ id:'test' ], file('${moduleDir}/tests/data/ncrna/seq_region/manifest.txt'), - file('${projectDir}/modules/assets/NO_FILE')] + file('${workflow.projectDir}/modules/assets/NO_FILE')] """ } } @@ -202,7 +202,7 @@ nextflow_process { """ input[0] = [[ id:'test' ], file('${moduleDir}/tests/data/repeat/custom_regex/manifest.txt'), - file('${projectDir}/modules/assets/NO_FILE')] + file('${workflow.projectDir}/modules/assets/NO_FILE')] """ } } @@ -225,7 +225,7 @@ nextflow_process { """ input[0] = [[ id:'test' ], file('${moduleDir}/tests/data/ncrna/custom_regex/manifest.txt'), - file('${projectDir}/modules/assets/NO_FILE')] + file('${workflow.projectDir}/modules/assets/NO_FILE')] """ } } @@ -245,7 +245,7 @@ nextflow_process { """ input[0] = [[ id:'test' ], file('${moduleDir}/tests/data/repeat/order/manifest.txt'), - file('${projectDir}/modules/assets/NO_FILE')] + file('${workflow.projectDir}/modules/assets/NO_FILE')] """ } } @@ -265,7 +265,7 @@ nextflow_process { """ input[0] = [[ id:'test' ], file('${moduleDir}/tests/data/ncrna/order/manifest.txt'), - file('${projectDir}/modules/assets/NO_FILE')] + file('${workflow.projectDir}/modules/assets/NO_FILE')] """ } } From b1147731de77ff495a8555a4cb64e6d1dbd616fc Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Wed, 11 Mar 2026 09:34:57 +0000 Subject: [PATCH 21/36] Remove outdated files --- modules/ensembl/fasta/split/main.nf | 13 +- .../ensembl/fasta/splitfasta/environment.yml | 8 - modules/ensembl/fasta/splitfasta/main.nf | 106 ---- .../ensembl/fasta/splitfasta/split_fasta.py | 462 ------------------ .../fasta/splitfasta/tests/data/agp/test.agp | 4 - 
.../fasta/splitfasta/tests/data/real/in.fa | 6 - .../tests/data/splits/default/0/test.1.fa | 4 - .../tests/data/splits/default/0/test.2.fa | 2 - .../tests/data/splits/multi_dir/0/0/test.1.fa | 2 - .../tests/data/splits/multi_dir/0/1/test.2.fa | 2 - .../tests/data/splits/unique/0/test.0.1.fa | 2 - .../tests/data/splits/unique/0/test.0.2.fa | 2 - .../fasta/splitfasta/tests/main.nf.test | 301 ------------ .../fasta/splitfasta/tests/main.nf.test.snap | 168 ------- tests/config/nextflow.config | 2 +- tests/test_split_fasta.py | 144 ------ 16 files changed, 13 insertions(+), 1215 deletions(-) delete mode 100644 modules/ensembl/fasta/splitfasta/environment.yml delete mode 100644 modules/ensembl/fasta/splitfasta/main.nf delete mode 100644 modules/ensembl/fasta/splitfasta/split_fasta.py delete mode 100644 modules/ensembl/fasta/splitfasta/tests/data/agp/test.agp delete mode 100644 modules/ensembl/fasta/splitfasta/tests/data/real/in.fa delete mode 100644 modules/ensembl/fasta/splitfasta/tests/data/splits/default/0/test.1.fa delete mode 100644 modules/ensembl/fasta/splitfasta/tests/data/splits/default/0/test.2.fa delete mode 100644 modules/ensembl/fasta/splitfasta/tests/data/splits/multi_dir/0/0/test.1.fa delete mode 100644 modules/ensembl/fasta/splitfasta/tests/data/splits/multi_dir/0/1/test.2.fa delete mode 100644 modules/ensembl/fasta/splitfasta/tests/data/splits/unique/0/test.0.1.fa delete mode 100644 modules/ensembl/fasta/splitfasta/tests/data/splits/unique/0/test.0.2.fa delete mode 100644 modules/ensembl/fasta/splitfasta/tests/main.nf.test delete mode 100644 modules/ensembl/fasta/splitfasta/tests/main.nf.test.snap delete mode 100644 tests/test_split_fasta.py diff --git a/modules/ensembl/fasta/split/main.nf b/modules/ensembl/fasta/split/main.nf index 8b5a845..8cc900b 100644 --- a/modules/ensembl/fasta/split/main.nf +++ b/modules/ensembl/fasta/split/main.nf @@ -13,6 +13,15 @@ // See the License for the specific language governing permissions and // limitations under the 
License. +def fasta_split_mem(longest_seq_bp) { + if( !longest_seq_bp || longest_seq_bp <= 0 ) return 8.GB + + // Heuristic: ~2.5 bytes/base peak => ~1 GB per 400 Mbp of the *longest* sequence + // Add 2GB base memory to account for overhead + def mem_gb = 2 + Math.ceil(longest_seq_bp as double / 400_000_000d) + return mem_gb.GB +} + process FASTA_SPLIT { tag "${meta.id}" @@ -21,8 +30,10 @@ process FASTA_SPLIT { conda "${moduleDir}/environment.yml" container "ensemblorg/ensembl-genomio:v1.6.1" + memory { fasta_split_mem(longest_seq_bp) } + input: - tuple val(meta), path(fasta) + tuple val(meta), path(fasta), val(longest_seq_bp) output: tuple val(meta), path("splits/**/*.fa"), emit: fastas diff --git a/modules/ensembl/fasta/splitfasta/environment.yml b/modules/ensembl/fasta/splitfasta/environment.yml deleted file mode 100644 index 759f3da..0000000 --- a/modules/ensembl/fasta/splitfasta/environment.yml +++ /dev/null @@ -1,8 +0,0 @@ ---- -name: "fasta_splitfasta" -channels: - - conda-forge - - bioconda -dependencies: - - python=3.11.7 - - biopython=1.86 \ No newline at end of file diff --git a/modules/ensembl/fasta/splitfasta/main.nf b/modules/ensembl/fasta/splitfasta/main.nf deleted file mode 100644 index 0a8b761..0000000 --- a/modules/ensembl/fasta/splitfasta/main.nf +++ /dev/null @@ -1,106 +0,0 @@ -// See the NOTICE file distributed with this work for additional information -// regarding copyright ownership. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -process FASTA_SPLITFASTA { - - tag "${meta.id}" - label 'process_low' - - publishDir "${params.outdir ?: '.'}", mode: 'copy' - - input: - tuple val(meta), path(fasta) - - output: - tuple val(meta), path("**/*.fa"), emit: fasta - tuple val(meta), path("*.agp"), emit: agp, optional: true - - script: - def args = [] - - if (params.max_seqs_per_file) { - args << "--max-seqs-per-file ${params.max_seqs_per_file}" - } - - if (params.max_seq_length_per_file) { - args << "--max-seq-length-per-file ${params.max_seq_length_per_file}" - } - - if (params.min_chunk_length) { - args << "--min-chunk-length ${params.min_chunk_length}" - } - - if (params.max_files_per_directory) { - args << "--max-files-per-directory ${params.max_files_per_directory}" - } - - if (params.max_dirs_per_directory) { - args << "--max-dirs-per-directory ${params.max_dirs_per_directory}" - } - - if (params.force_max_seq_length) { - args << "--force-max-seq-length" - } - - if (params.write_agp) { - args << "--write-agp" - } - - if (params.unique_file_names) { - args << "--unique-file-names" - } - - if (params.delete_existing_files) { - args << "--delete-existing-files" - } - - """ - python \\ - ${moduleDir}/split_fasta.py \\ - --fasta-file \$PWD/${fasta} \\ - --out-dir \$PWD \\ - ${args.join(' ')} - """ - - stub: - """ - set -euo pipefail - - FIXTURE_DIR="${moduleDir}/tests/data" - - LAYOUT="default" - if [[ "${params.unique_file_names ?: false}" == "true" ]]; then - LAYOUT="unique" - elif [[ -n "${params.max_dirs_per_directory ?: ''}" || -n "${params.max_files_per_directory ?: ''}" ]]; then - LAYOUT="multi_dir" - fi - - mkdir -p splits - cp -R "\$FIXTURE_DIR/splits/\$LAYOUT/." 
"splits/" - - find splits -type f -name 'test*.fa' | while read -r f; do - bn=\$(basename "\$f") - dir=\$(dirname "\$f") - new_bn="\${bn/test/${meta.id}}" - mv "\$f" "\${dir}/\${new_bn}" - done - - if [[ "${params.write_agp ?: false}" == "true" ]]; then - cp "\$FIXTURE_DIR/agp/test.agp" "${meta.id}.agp" - fi - """ - - -} diff --git a/modules/ensembl/fasta/splitfasta/split_fasta.py b/modules/ensembl/fasta/splitfasta/split_fasta.py deleted file mode 100644 index 164ec44..0000000 --- a/modules/ensembl/fasta/splitfasta/split_fasta.py +++ /dev/null @@ -1,462 +0,0 @@ -#!/usr/bin/env python3 - -"""Split a FASTA file (possibly gzipped) into multiple smaller FASTA files.""" - -import inspect -import logging -import shutil -from pathlib import Path -from typing import Optional, List, Set, Tuple - -from Bio import SeqIO -from Bio.SeqRecord import SeqRecord - -try: - from ensembl.utils.archive import open_gz_file # type: ignore -except ImportError: - import gzip - - def open_gz_file(path): - p = str(path) - return gzip.open(p, "rt") if p.endswith(".gz") else open(p, "rt") - - -try: - from ensembl.utils.argparse import ArgumentParser # type: ignore -except ImportError: - from argparse import ArgumentParser - -try: - from ensembl.utils.logging import init_logging_with_args # type: ignore -except ImportError: - import logging - - def init_logging_with_args(args): - level = getattr(args, "log_level", "INFO") - logging.basicConfig(level=level) - - -class Params: - """Class to hold parameters for splitting FASTA files.""" - - def __init__( - self, - fasta_file: Path, - out_dir: Optional[Path] = None, - write_agp: bool = False, - max_seqs_per_file: Optional[int] = None, - max_seq_length_per_file: Optional[int] = None, - min_chunk_length: Optional[int] = None, - max_files_per_directory: Optional[int] = None, - max_dirs_per_directory: Optional[int] = None, - delete_existing_files: bool = False, - unique_file_names: bool = False, - delete_original_file: bool = False, - 
force_max_seq_length: bool = False, - ): - self.fasta_file = fasta_file - self.out_dir = out_dir if out_dir is not None else fasta_file.parent - self.write_agp = write_agp - self.max_seqs_per_file = max_seqs_per_file - self.max_seq_length_per_file = max_seq_length_per_file - self.min_chunk_length = min_chunk_length - self.max_files_per_directory = max_files_per_directory - self.max_dirs_per_directory = max_dirs_per_directory - self.delete_existing_files = delete_existing_files - self.unique_file_names = unique_file_names - self.delete_original_file = delete_original_file - self.force_max_seq_length = force_max_seq_length - - self._validate_params() - - def _validate_params(self) -> None: - if self.max_dirs_per_directory is not None and self.max_dirs_per_directory <= 0: - raise ValueError("--max-dirs-per-directory must be > 0 or None") - if ( - self.max_files_per_directory is not None - and self.max_files_per_directory <= 0 - ): - raise ValueError("--max-files-per-directory must be > 0 or None") - if self.max_seqs_per_file is not None and self.max_seqs_per_file <= 0: - raise ValueError("--max-seqs-per-file must be > 0 or None") - if ( - self.max_seq_length_per_file is not None - and self.max_seq_length_per_file <= 0 - ): - raise ValueError("--max-seq-length-per-file must be > 0 or None") - if self.min_chunk_length is not None: - if self.max_seq_length_per_file is None: - raise ValueError( - "--min-chunk-length requires --max-seq-length-per-file" - ) - if self.min_chunk_length <= 0: - raise ValueError("--min-chunk-length must be > 0") - - -class OutputWriter: - """ - Manages output file creation and counters, writing in a single pass. - Creates/cleans directories lazily as required. 
- """ - - def __init__(self, params: Params): - self.params = params - self.basename = ( - params.fasta_file.name.removesuffix(".gz") - .removesuffix(".fa") - .removesuffix(".fasta") - ) - self.agp_file = ( - self.params.out_dir.joinpath(self.basename + ".agp") - if params.write_agp - else None - ) - self.file_count = 0 - self.record_count = 0 - self.file_len = 0 - self._fh = None - self._agp_fh = None - self._cleaned_dirs: Set[Path] = set() - - self.open_new_file() - - def _create_or_clean_dir(self, dir_path: Path) -> None: - try: - dir_path.mkdir(parents=True, exist_ok=True) - if self.params.delete_existing_files and dir_path not in self._cleaned_dirs: - for child in dir_path.iterdir(): - if child.is_dir(): - shutil.rmtree(child) - else: - child.unlink() - self._cleaned_dirs.add(dir_path) - except Exception: - logging.exception("Failed to prepare output directory '%s'", dir_path) - raise - - def _get_subdir_path(self, dir_index: int) -> Path: - """Computes subdirectory path based on dir_index and max_dirs_per_directory.""" - parts = [] - max_dirs = self.params.max_dirs_per_directory - if max_dirs is None: - parts.append("1") - else: - current_index = dir_index - while current_index >= 0: - parts.append(f"{current_index % max_dirs}") - current_index = current_index // max_dirs - 1 - - parts.reverse() - return self.params.out_dir.joinpath(*parts) - - def _get_file_and_dir_index(self) -> Tuple[int, int]: - """ - Determines index of file and directory based on file count and max files per directory. - Returns (file_index, dir_index). 
- """ - max_files = self.params.max_files_per_directory - if max_files is None: - return self.file_count, 0 - adjusted_count = self.file_count - 1 - return (adjusted_count % max_files + 1, adjusted_count // max_files) - - def _get_path_for_next_file(self) -> Path: - """Computes path for the next output file.""" - self.file_count += 1 - file_index, dir_index = self._get_file_and_dir_index() - subdir_path = self._get_subdir_path(dir_index) - self._create_or_clean_dir(subdir_path) - - if self.params.unique_file_names: - file_name = f"{self.basename}.{dir_index}.{file_index}.fa" - else: - file_name = f"{self.basename}.{file_index}.fa" - return subdir_path.joinpath(file_name) - - def add_agp_entry( - self, - object_id: str, - start: int, - end: int, - part_nr: int, - part_id: str, - part_length: int, - ) -> None: - """Adds an entry to the AGP file.""" - # AGP columns for WGS contig component type: - # object, object_beg, object_end, part_number, component_type, - # component_id, component_beg, component_end, orientation - if self._agp_fh is None: - return - try: - line = f"{object_id}\t{start}\t{end}\t{part_nr}\tW\t{part_id}\t1\t{part_length}\t+\n" - self._agp_fh.write(line) - except Exception: - logging.exception("Failed to write AGP entry for part '%s'", part_id) - raise - - def create_agp_file(self) -> None: - """Creates the AGP file for recording sequence chunking.""" - if self.agp_file is None: - return - try: - self.params.out_dir.mkdir(parents=True, exist_ok=True) - self._agp_fh = open(self.agp_file, "w") - self._agp_fh.write("# AGP-version 2.0\n") - logging.info("Created AGP file '%s'", self.agp_file) - except Exception: - logging.exception("Failed to open AGP file '%s'", self.agp_file) - raise - - def open_new_file(self) -> None: - """Closes current file (if any) and opens a new output file.""" - if self._fh is not None: - self._fh.close() - - path = self._get_path_for_next_file() - try: - self._fh = open(path, "w") - logging.debug("Opened output file '%s'", 
path) - except Exception: - logging.exception("Failed to open output file '%s'", path) - raise - self.record_count = 0 - self.file_len = 0 - - def write_record(self, record: SeqRecord) -> None: - """Writes a SeqRecord to the current output file.""" - try: - SeqIO.write(record, self._fh, "fasta") - self.record_count += 1 - self.file_len += len(record.seq) - except Exception: - logging.exception("Failed to write record '%s' to output file", record.id) - raise - - def close(self) -> None: - if self._fh is not None: - self._fh.close() - self._fh = None - if self._agp_fh is not None: - self._agp_fh.close() - self._agp_fh = None - - -def _get_param_defaults() -> dict: - """Retrieve default values for Params class attributes.""" - signature = inspect.signature(Params.__init__) - defaults = {} - for name, param in signature.parameters.items(): - if name != "self" and param.default is not inspect.Parameter.empty: - defaults[name] = param.default - return defaults - - -def split_fasta(params: Params) -> None: - """Splits the input FASTA file into multiple smaller FASTA files, chunking long sequences if required.""" - if not params.fasta_file.exists(): - logging.error( - "DEBUG: fasta_file=%r resolved=%r cwd=%r", - str(params.fasta_file), - str(Path(params.fasta_file).resolve()), - str(Path.cwd()), - ) - raise FileNotFoundError(f"Fasta file '{params.fasta_file}' does not exist") - - # Do nothing if file size is 0 - if params.fasta_file.stat().st_size == 0: - logging.info("Input FASTA '%s' is empty; nothing to do", params.fasta_file) - return - - params.out_dir.mkdir(parents=True, exist_ok=True) - - writer = OutputWriter(params) - - try: - if params.write_agp: - writer.create_agp_file() - - with open_gz_file(params.fasta_file) as fh: - for record in SeqIO.parse(fh, "fasta"): - seq_len = len(record.seq) - max_seq_len = params.max_seq_length_per_file - max_seqs = params.max_seqs_per_file - - if max_seqs is not None and writer.record_count >= max_seqs: - writer.open_new_file() - 
- if max_seq_len is None or writer.file_len + seq_len <= max_seq_len: - writer.write_record(record) - if params.write_agp: - writer.add_agp_entry( - record.id, 1, seq_len, 1, record.id, seq_len - ) - continue - - if params.force_max_seq_length and seq_len > max_seq_len: - starts = list(range(0, seq_len, max_seq_len)) - ends = [min(s + max_seq_len, seq_len) for s in starts] - - if params.min_chunk_length is not None and len(starts) > 1: - last_chunk_len = ends[-1] - starts[-1] - if last_chunk_len < params.min_chunk_length: - logging.warning( - "Length of last chunk of record '%s' is %d, lower than min_chunk_length: %d;" - + "merging with previous chunk", - record.id, - last_chunk_len, - params.min_chunk_length, - ) - ends[-2] = seq_len - starts.pop() - ends.pop() - - for i, (start, end) in enumerate(zip(starts, ends), start=1): - chunk_seq = record.seq[start:end] - chunk_record = SeqRecord( - chunk_seq, - id=f"{record.id}_chunk_start_{start}", - description=f"{record.description} (part {i})", - ) - if writer.record_count > 0: - writer.open_new_file() - writer.write_record(chunk_record) - - if params.write_agp: - writer.add_agp_entry( - record.id, - start + 1, - end, - i, - chunk_record.id, - len(chunk_seq), - ) - else: - logging.warning( - "Record '%s' length %d exceeds max_seq_length_per_file %d but chunking not enabled", - record.id, - seq_len, - max_seq_len, - ) - if writer.record_count > 0: - writer.open_new_file() - writer.write_record(record) - if params.write_agp: - writer.add_agp_entry( - record.id, 1, seq_len, 1, record.id, seq_len - ) - except Exception: - logging.exception("Error processing FASTA file '%s'", params.fasta_file) - raise - finally: - writer.close() - - if params.delete_original_file: - try: - params.fasta_file.unlink(missing_ok=True) - except Exception: - logging.warning( - "Failed to delete original FASTA file '%s'", - params.fasta_file, - exc_info=True, - ) - - -def parse_args(argv: Optional[List[str]] = None) -> Params: - defaults = 
_get_param_defaults() - parser = ArgumentParser( - description="Split a FASTA file into multiple FASTA files, optionally chunking long sequences." - ) - parser.add_argument( - "--fasta-file", - type=Path, - required=True, - help="Input raw or compressed FASTA file containing sequences to split", - ) - parser.add_argument( - "--out-dir", - type=Path, - help="Top-level output directory (default: input FASTA directory)", - ) - parser.add_argument( - "--write-agp", - action="store_true", - help=f"Write AGP file describing the splits (default: {defaults['write_agp']})", - ) - parser.add_argument( - "--max-seqs-per-file", - type=int, - help=f"Max records per output file (default: {defaults['max_seqs_per_file']})", - ) - parser.add_argument( - "--max-seq-length-per-file", - type=int, - help=f"Max cumulative sequence length per output file (default: {defaults['max_seq_length_per_file']})", - ) - parser.add_argument( - "--min-chunk-length", - type=int, - help=f"Minimum length of a chunk allowed as a remainder (default: {defaults['min_chunk_length']})", - ) - parser.add_argument( - "--max-files-per-directory", - type=int, - help=f"Max files per directory before moving to next computed dir (default: {defaults['max_files_per_directory']})", - ) - parser.add_argument( - "--max-dirs-per-directory", - type=int, - help=f"Max subdirectories per directory level (default: {defaults['max_dirs_per_directory']})", - ) - parser.add_argument( - "--delete-existing-files", - action="store_true", - help=f"Delete existing files within computed output dirs (default: {defaults['delete_existing_files']})", - ) - parser.add_argument( - "--unique-file-names", - action="store_true", - help=f"Make output file names unique across dirs by including dir_index (default: {defaults['unique_file_names']})", - ) - parser.add_argument( - "--delete-original-file", - action="store_true", - help=f"Delete original input FASTA after splitting (default: {defaults['delete_original_file']})", - ) - 
parser.add_argument( - "--force-max-seq-length", - action="store_true", - help=f"Chunk single sequences longer than max-seq-length-per-file (default: {defaults['force_max_seq_length']})", - ) - - args = parser.parse_args(argv) - init_logging_with_args(args) - - params = Params( - fasta_file=args.fasta_file, - out_dir=args.out_dir, - write_agp=args.write_agp, - max_seqs_per_file=args.max_seqs_per_file, - max_seq_length_per_file=args.max_seq_length_per_file, - min_chunk_length=args.min_chunk_length, - max_files_per_directory=args.max_files_per_directory, - max_dirs_per_directory=args.max_dirs_per_directory, - delete_existing_files=args.delete_existing_files, - unique_file_names=args.unique_file_names, - delete_original_file=args.delete_original_file, - force_max_seq_length=args.force_max_seq_length, - ) - return params - - -def main(argv: Optional[List[str]] = None) -> None: - try: - params = parse_args(argv) - split_fasta(params) - except Exception: - logging.exception("Error processing FASTA file '%s'", params.fasta_file) - raise - - -if __name__ == "__main__": - main() diff --git a/modules/ensembl/fasta/splitfasta/tests/data/agp/test.agp b/modules/ensembl/fasta/splitfasta/tests/data/agp/test.agp deleted file mode 100644 index 46fc419..0000000 --- a/modules/ensembl/fasta/splitfasta/tests/data/agp/test.agp +++ /dev/null @@ -1,4 +0,0 @@ -# AGP-version 2.0 -seq1 1 10 1 W seq1 1 10 + -seq2 1 10 1 W seq2 1 10 + -seq3 1 11 1 W seq3 1 11 + \ No newline at end of file diff --git a/modules/ensembl/fasta/splitfasta/tests/data/real/in.fa b/modules/ensembl/fasta/splitfasta/tests/data/real/in.fa deleted file mode 100644 index 3d3f65c..0000000 --- a/modules/ensembl/fasta/splitfasta/tests/data/real/in.fa +++ /dev/null @@ -1,6 +0,0 @@ ->seq1 -AAAAAAAAAA ->seq2 -CCCCCCCCCC ->seq3 -GGGGGGGGGGG \ No newline at end of file diff --git a/modules/ensembl/fasta/splitfasta/tests/data/splits/default/0/test.1.fa b/modules/ensembl/fasta/splitfasta/tests/data/splits/default/0/test.1.fa deleted 
file mode 100644 index 7abe938..0000000 --- a/modules/ensembl/fasta/splitfasta/tests/data/splits/default/0/test.1.fa +++ /dev/null @@ -1,4 +0,0 @@ ->seq1 -AAAAAAAAAA ->seq2 -CCCCCCCCCC \ No newline at end of file diff --git a/modules/ensembl/fasta/splitfasta/tests/data/splits/default/0/test.2.fa b/modules/ensembl/fasta/splitfasta/tests/data/splits/default/0/test.2.fa deleted file mode 100644 index 6287efa..0000000 --- a/modules/ensembl/fasta/splitfasta/tests/data/splits/default/0/test.2.fa +++ /dev/null @@ -1,2 +0,0 @@ ->seq3 -GGGGGGGGGGG \ No newline at end of file diff --git a/modules/ensembl/fasta/splitfasta/tests/data/splits/multi_dir/0/0/test.1.fa b/modules/ensembl/fasta/splitfasta/tests/data/splits/multi_dir/0/0/test.1.fa deleted file mode 100644 index 9512f36..0000000 --- a/modules/ensembl/fasta/splitfasta/tests/data/splits/multi_dir/0/0/test.1.fa +++ /dev/null @@ -1,2 +0,0 @@ ->seq1 -AAAAAAAAAA \ No newline at end of file diff --git a/modules/ensembl/fasta/splitfasta/tests/data/splits/multi_dir/0/1/test.2.fa b/modules/ensembl/fasta/splitfasta/tests/data/splits/multi_dir/0/1/test.2.fa deleted file mode 100644 index 2f3b40f..0000000 --- a/modules/ensembl/fasta/splitfasta/tests/data/splits/multi_dir/0/1/test.2.fa +++ /dev/null @@ -1,2 +0,0 @@ ->seq2 -CCCCCCCCCC \ No newline at end of file diff --git a/modules/ensembl/fasta/splitfasta/tests/data/splits/unique/0/test.0.1.fa b/modules/ensembl/fasta/splitfasta/tests/data/splits/unique/0/test.0.1.fa deleted file mode 100644 index 9512f36..0000000 --- a/modules/ensembl/fasta/splitfasta/tests/data/splits/unique/0/test.0.1.fa +++ /dev/null @@ -1,2 +0,0 @@ ->seq1 -AAAAAAAAAA \ No newline at end of file diff --git a/modules/ensembl/fasta/splitfasta/tests/data/splits/unique/0/test.0.2.fa b/modules/ensembl/fasta/splitfasta/tests/data/splits/unique/0/test.0.2.fa deleted file mode 100644 index 2f3b40f..0000000 --- a/modules/ensembl/fasta/splitfasta/tests/data/splits/unique/0/test.0.2.fa +++ /dev/null @@ -1,2 +0,0 @@ ->seq2 
-CCCCCCCCCC \ No newline at end of file diff --git a/modules/ensembl/fasta/splitfasta/tests/main.nf.test b/modules/ensembl/fasta/splitfasta/tests/main.nf.test deleted file mode 100644 index 3db1283..0000000 --- a/modules/ensembl/fasta/splitfasta/tests/main.nf.test +++ /dev/null @@ -1,301 +0,0 @@ -// nf-core modules test fasta/splitfasta -nextflow_process { - - name "Test Process FASTA_SPLITFASTA" - script "../main.nf" - process "FASTA_SPLITFASTA" - - tag "modules" - tag "modules_ensembl" - tag "fasta" - tag "fasta/splitfasta" - - - def real_fa = new File("modules/ensembl/fasta/splitfasta/tests/data/real/in.fa").canonicalFile - - test("Stub outputs: default layout, no AGP") { - - when { - options "-stub" - - // Ensure params are set explicitly for this test - params.write_agp = false - params.unique_file_names = false - params.max_files_per_directory = null - params.max_dirs_per_directory = null - - process { - """ - input[0] = [[ id:'test' ], file('dummy.fa')] - """ - } - } - - then { - assert snapshot(process.out).match() - - // fasta: tuple(meta, fa_paths) - assert process.out.fasta != null - assert process.out.fasta.size() == 1 - - def fasta_out = process.out.fasta[0] - def meta = fasta_out[0] - def fas = fasta_out[1] - - assert meta.id == "test" - assert fas != null - assert fas.size() == 2 - - // agp: tuple(meta, agp_paths) optional -> should be absent - assert process.out.agp != null - assert process.out.agp.size() == 0 - - // Ensure FASTA parsing works (downstream contract) - def merged = fas - .collect { path(it).fasta } - .inject([:]) { acc, m -> acc + m } - - assert merged.keySet().containsAll(["seq1", "seq2", "seq3"]) - - assertAll( - { assert process.success } - ) - } - } - - test("Stub outputs: AGP optional output appears when enabled") { - - when { - options "-stub" - - params.write_agp = true - params.unique_file_names = false - params.max_files_per_directory = null - params.max_dirs_per_directory = null - - process { - """ - input[0] = [[ id:'test' 
], file('dummy.fa')] - """ - } - } - - then { - assert snapshot(process.out).match() - - assert process.out.fasta.size() == 1 - def fasta_out = process.out.fasta[0] - def fas = fasta_out[1] - assert fas.size() == 2 - - assert process.out.agp.size() == 1 - def agp_out = process.out.agp[0] - def agp_meta = agp_out[0] - def agp = agp_out[1] - def agp_paths = agp instanceof List ? agp : [agp] - def agp_file = path(agp_paths[0]).toFile() - - assert agp_meta.id == "test" - assert agp_paths.size() == 1 - assert agp_file.name == "test.agp" - - def agp_text = agp_file.text - assert agp_text.startsWith("# AGP-version 2.0") - assert agp_text.contains("seq1\t1\t10\t1\tW\tseq1\t1\t10\t+") - assert agp_text.contains("seq2\t1\t10\t1\tW\tseq2\t1\t10\t+") - assert agp_text.contains("seq3\t1\t11\t1\tW\tseq3\t1\t11\t+") - - assertAll( - { assert process.success } - ) - } - } - - test("Stub outputs: unique_file_names contract") { - - when { - options "-stub" - - params.write_agp = false - params.unique_file_names = true - params.max_files_per_directory = null - params.max_dirs_per_directory = null - - process { - """ - input[0] = [[ id:'test' ], file('dummy.fa')] - """ - } - } - - then { - assert snapshot(process.out).match() - - def fasta_out = process.out.fasta[0] - def fas = fasta_out[1] - - assert fas.size() == 2 - assert process.out.agp.size() == 0 - - // Contract check: names match the unique fixture pattern - assert fas.collect { path(it).toFile().name }.sort() == ["test.0.1.fa", "test.0.2.fa"] - - assertAll( - { assert process.success } - ) - } - } - - test("Stub outputs: nested directory layout contract") { - - when { - options "-stub" - - params.write_agp = false - params.unique_file_names = false - - // Trigger stub's nested fixture selection - params.max_files_per_directory = 100 - params.max_dirs_per_directory = 100 - - process { - """ - input[0] = [[ id:'test' ], file('dummy.fa')] - """ - } - } - - then { - assert snapshot(process.out).match() - - def fastas = 
process.out.fasta[0][1] - assert fastas.size() == 2 - assert process.out.agp.size() == 0 - - def rels = fastas.collect { path(it).toString() } - assert rels.any { it.contains("splits/0/0/") } - assert rels.any { it.contains("splits/0/1/") } - - assertAll( - { assert process.success } - ) - } - } - - test("Real run: default behaviour produces FASTAs and no AGP") { - - when { - params.write_agp = false - params.unique_file_names = false - params.max_seqs_per_file = null - params.max_seq_length_per_file = null - params.max_files_per_directory = null - params.max_dirs_per_directory = null - params.force_max_seq_length = false - - process { - """ - input[0] = [[ id:'test' ], file('${real_fa.absolutePath}')] - """ - } - } - - then { - assert process.success - - assert process.out.fasta != null - assert process.out.fasta.size() == 1 - - def out = process.out.fasta[0] - def meta = out[0] - def fas = out[1] - - assert meta.id == "test" - def fas_list = (fas instanceof List) ? fas : [fas] - assert fas_list.size() >= 1 - - assert process.out.agp != null - assert process.out.agp.size() == 0 - - def merged = fas_list - .collect { path(it).fasta } - .inject([:]) { acc, m -> acc + m } - - assert merged.keySet().containsAll(["seq1", "seq2", "seq3"]) - } - } - - test("Real run: write_agp=true emits exactly one AGP file") { - - when { - params.write_agp = true - params.unique_file_names = false - params.max_files_per_directory = null - params.max_dirs_per_directory = null - params.max_seqs_per_file = null - params.max_seq_length_per_file = null - params.force_max_seq_length = false - - process { - """ - input[0] = [[ id:'test' ], file('${real_fa.absolutePath}')] - """ - } - } - - then { - assert process.success - - assert process.out.agp != null - assert process.out.agp.size() == 1 - - def agp_out = process.out.agp[0] - def agp_meta = agp_out[0] - def agp_val = agp_out[1] - - assert agp_meta.id == "test" - - def agp_list = (agp_val instanceof List) ? 
agp_val : [agp_val] - assert agp_list.size() == 1 - - def agp_path = path(agp_list[0]) - assert agp_path.fileName.toString().endsWith(".agp") - - def agp_text = agp_path.toFile().text - assert agp_text.startsWith("# AGP-version 2.0") - assert agp_text.contains("seq1\t1\t10\t1\tW\tseq1\t1\t10\t+") - assert agp_text.contains("seq2\t1\t10\t1\tW\tseq2\t1\t10\t+") - assert agp_text.contains("seq3\t1\t11\t1\tW\tseq3\t1\t11\t+") - } - } - - test("Real run: max_seqs_per_file=2 splits into 2 FASTA outputs") { - - when { - params.write_agp = false - params.max_seqs_per_file = 2 - params.unique_file_names = false - params.max_files_per_directory = null - params.max_dirs_per_directory = null - - process { - """ - input[0] = [[ id:'test' ], file('${real_fa.absolutePath}')] - """ - } - } - - then { - assert process.success - - def fas = process.out.fasta[0][1] - assert fas.size() == 2 - - def merged = fas - .collect { path(it).fasta } - .inject([:]) { acc, m -> acc + m } - - assert merged.keySet().containsAll(["seq1", "seq2", "seq3"]) - } - } -} diff --git a/modules/ensembl/fasta/splitfasta/tests/main.nf.test.snap b/modules/ensembl/fasta/splitfasta/tests/main.nf.test.snap deleted file mode 100644 index 3390583..0000000 --- a/modules/ensembl/fasta/splitfasta/tests/main.nf.test.snap +++ /dev/null @@ -1,168 +0,0 @@ -{ - "Stub outputs: AGP optional output appears when enabled": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - [ - "test.1.fa:md5,336490c5e8c624cb1ae29048f28f0978", - "test.2.fa:md5,55d5ca305356033516f7ae1b5ecca900" - ] - ] - ], - "1": [ - [ - { - "id": "test" - }, - "test.agp:md5,c12ac51bd2b1ca95cdd8f011eca0cd1c" - ] - ], - "agp": [ - [ - { - "id": "test" - }, - "test.agp:md5,c12ac51bd2b1ca95cdd8f011eca0cd1c" - ] - ], - "fasta": [ - [ - { - "id": "test" - }, - [ - "test.1.fa:md5,336490c5e8c624cb1ae29048f28f0978", - "test.2.fa:md5,55d5ca305356033516f7ae1b5ecca900" - ] - ] - ] - } - ], - "meta": { - "nf-test": "0.9.3", - "nextflow": "25.10.3" - }, - 
"timestamp": "2026-01-30T10:38:07.606463" - }, - "Stub outputs: nested directory layout contract": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - [ - "test.1.fa:md5,41e176f082cc04841e50d8aa5c4f4d5a", - "test.2.fa:md5,e3bd0305f6466c13a1479c6b82391e6f" - ] - ] - ], - "1": [ - - ], - "agp": [ - - ], - "fasta": [ - [ - { - "id": "test" - }, - [ - "test.1.fa:md5,41e176f082cc04841e50d8aa5c4f4d5a", - "test.2.fa:md5,e3bd0305f6466c13a1479c6b82391e6f" - ] - ] - ] - } - ], - "meta": { - "nf-test": "0.9.3", - "nextflow": "25.10.3" - }, - "timestamp": "2026-01-30T10:38:11.815126" - }, - "Stub outputs: default layout, no AGP": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - [ - "test.1.fa:md5,336490c5e8c624cb1ae29048f28f0978", - "test.2.fa:md5,55d5ca305356033516f7ae1b5ecca900" - ] - ] - ], - "1": [ - - ], - "agp": [ - - ], - "fasta": [ - [ - { - "id": "test" - }, - [ - "test.1.fa:md5,336490c5e8c624cb1ae29048f28f0978", - "test.2.fa:md5,55d5ca305356033516f7ae1b5ecca900" - ] - ] - ] - } - ], - "meta": { - "nf-test": "0.9.3", - "nextflow": "25.10.3" - }, - "timestamp": "2026-01-30T10:38:05.482323" - }, - "Stub outputs: unique_file_names contract": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - [ - "test.0.1.fa:md5,41e176f082cc04841e50d8aa5c4f4d5a", - "test.0.2.fa:md5,e3bd0305f6466c13a1479c6b82391e6f" - ] - ] - ], - "1": [ - - ], - "agp": [ - - ], - "fasta": [ - [ - { - "id": "test" - }, - [ - "test.0.1.fa:md5,41e176f082cc04841e50d8aa5c4f4d5a", - "test.0.2.fa:md5,e3bd0305f6466c13a1479c6b82391e6f" - ] - ] - ] - } - ], - "meta": { - "nf-test": "0.9.3", - "nextflow": "25.10.3" - }, - "timestamp": "2026-01-30T10:38:09.698407" - } -} \ No newline at end of file diff --git a/tests/config/nextflow.config b/tests/config/nextflow.config index e4c8606..a527e1f 100644 --- a/tests/config/nextflow.config +++ b/tests/config/nextflow.config @@ -16,5 +16,5 @@ includeConfig 'test_data.config' singularity { - enabled = true + enabled = false } diff --git 
a/tests/test_split_fasta.py b/tests/test_split_fasta.py deleted file mode 100644 index 8a48af2..0000000 --- a/tests/test_split_fasta.py +++ /dev/null @@ -1,144 +0,0 @@ -# tests/test_split_fasta.py -from pathlib import Path - -import pytest -from Bio import SeqIO -from Bio.Seq import Seq -from Bio.SeqRecord import SeqRecord - - -def write_fasta(path: Path, records): - with open(path, "w", encoding="utf-8", newline="\n") as fh: - SeqIO.write(records, fh, "fasta") - - -def list_output_fastas(out_dir: Path): - return sorted(out_dir.rglob("*.fa")) - - -def read_all_ids_from_fastas(out_dir: Path): - ids = [] - for fa in list_output_fastas(out_dir): - with open(fa, "r", encoding="utf-8") as fh: - ids.extend([r.id for r in SeqIO.parse(fh, "fasta")]) - return ids - - -def parse_agp_lines(agp_path: Path): - lines = [l.rstrip("\n") for l in agp_path.read_text(encoding="utf-8").splitlines()] - lines = [l for l in lines if l and not l.startswith("#")] - return [l.split("\t") for l in lines] - - -def test_no_agp_by_default(tmp_path: Path, split_fasta_module): - inp = tmp_path / "in.fa" - out = tmp_path / "out" - write_fasta(inp, [SeqRecord(Seq("ACGT"), id="seq1", description="")]) - - params = split_fasta_module.Params( - fasta_file=inp, - out_dir=out, - write_agp=False, - ) - split_fasta_module.split_fasta(params) - - assert not (out / "in.agp").exists() - assert len(list_output_fastas(out)) >= 1 - - -def test_split_by_max_seqs_per_file(tmp_path: Path, split_fasta_module): - inp = tmp_path / "in.fa" - out = tmp_path / "out" - recs = [ - SeqRecord(Seq("A" * 10), id="s1", description=""), - SeqRecord(Seq("C" * 10), id="s2", description=""), - SeqRecord(Seq("G" * 10), id="s3", description=""), - ] - write_fasta(inp, recs) - - params = split_fasta_module.Params( - fasta_file=inp, - out_dir=out, - max_seqs_per_file=2, - write_agp=False, - ) - split_fasta_module.split_fasta(params) - - fas = list_output_fastas(out) - assert len(fas) == 2 - assert read_all_ids_from_fastas(out) == 
["s1", "s2", "s3"] - - -def test_chunk_merge_final_small_chunk_and_agp(tmp_path: Path, split_fasta_module): - """ - seq_len=2100, max=1000 -> chunks [1000, 1000, 100] - min_chunk_length=200 -> final chunk merged -> [1000, 1100] - """ - inp = tmp_path / "in.fa" - out = tmp_path / "out" - write_fasta(inp, [SeqRecord(Seq("A" * 2100), id="chr1", description="chr1")]) - - params = split_fasta_module.Params( - fasta_file=inp, - out_dir=out, - write_agp=True, - force_max_seq_length=True, - max_seq_length_per_file=1000, - min_chunk_length=200, - max_seqs_per_file=100000, # avoid seq-count splitting interfering - ) - split_fasta_module.split_fasta(params) - - # 2 chunks expected after merge - assert read_all_ids_from_fastas(out) == [ - "chr1_chunk_start_0", - "chr1_chunk_start_1000", - ] - - agp = out / "in.agp" - assert agp.exists() - - cols = parse_agp_lines(agp) - assert len(cols) == 2 - - # object, obj_beg, obj_end, part_no, type, comp_id, comp_beg, comp_end, orient - assert cols[0][0] == "chr1" - assert cols[0][1:4] == ["1", "1000", "1"] - assert cols[0][4] == "W" - assert cols[0][5] == "chr1_chunk_start_0" - assert cols[0][6:9] == ["1", "1000", "+"] - - assert cols[1][0] == "chr1" - assert cols[1][1:4] == ["1001", "2100", "2"] - assert cols[1][4] == "W" - assert cols[1][5] == "chr1_chunk_start_1000" - assert cols[1][6:9] == ["1", "1100", "+"] - - -def test_agp_part_numbers_restart_per_object(tmp_path: Path, split_fasta_module): - inp = tmp_path / "in.fa" - out = tmp_path / "out" - recs = [ - SeqRecord(Seq("A" * 1200), id="obj1", description=""), - SeqRecord(Seq("C" * 1200), id="obj2", description=""), - ] - write_fasta(inp, recs) - - params = split_fasta_module.Params( - fasta_file=inp, - out_dir=out, - write_agp=True, - force_max_seq_length=True, - max_seq_length_per_file=1000, - min_chunk_length=100, # => 2 chunks each, no merge - ) - split_fasta_module.split_fasta(params) - - cols = parse_agp_lines(out / "in.agp") - - by_obj = {} - for c in cols: - 
by_obj.setdefault(c[0], []).append(int(c[3])) - - assert by_obj["obj1"] == [1, 2] - assert by_obj["obj2"] == [1, 2] From 824066c1b8eceff48ef98eb67af437d9549c7707 Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Wed, 11 Mar 2026 12:29:31 +0000 Subject: [PATCH 22/36] Remove use of test data --- modules/ensembl/fasta/recombine/main.nf | 24 +- .../recombine/tests/data/agp/inputs/part1.fa | 2 - .../recombine/tests/data/agp/inputs/part2.fa | 2 - .../recombine/tests/data/agp/manifest.txt | 2 - .../recombine/tests/data/agp/output/test.fa | 2 - .../fasta/recombine/tests/data/agp/test.agp | 3 - .../tests/data/custom_regex/inputs/seq1_1.fa | 2 - .../tests/data/custom_regex/inputs/seq1_5.fa | 2 - .../tests/data/custom_regex/manifest.txt | 2 - .../tests/data/custom_regex/output/test.fa | 2 - .../data/header/inputs/seq1_chunk_start_1.fa | 2 - .../data/header/inputs/seq1_chunk_start_5.fa | 2 - .../tests/data/header/inputs/seq2.fa | 2 - .../recombine/tests/data/header/manifest.txt | 3 - .../tests/data/header/output/test.fa | 4 - .../tests/data/order/inputs/01_second.fa | 2 - .../tests/data/order/inputs/02_first.fa | 2 - .../recombine/tests/data/order/manifest.txt | 2 - .../recombine/tests/data/order/output/test.fa | 4 - .../fasta/recombine/tests/main.nf.test | 117 +--- .../fasta/recombine/tests/main.nf.test.snap | 124 +---- modules/ensembl/fasta/split/main.nf | 48 +- .../fasta/split/tests/data/agp/test.agp | 4 - .../ensembl/fasta/split/tests/data/real/in.fa | 6 - .../tests/data/splits/default/0/test.1.fa | 4 - .../tests/data/splits/default/0/test.2.fa | 2 - .../tests/data/splits/multi_dir/0/0/test.1.fa | 2 - .../tests/data/splits/multi_dir/0/1/test.2.fa | 2 - .../tests/data/splits/unique/0/test.0.1.fa | 2 - .../tests/data/splits/unique/0/test.0.2.fa | 2 - .../ensembl/fasta/split/tests/main.nf.test | 136 +---- .../fasta/split/tests/main.nf.test.snap | 68 ++- modules/ensembl/features/combine_json/main.nf | 52 +- .../tests/data/ncrna/agp/.DS_Store | Bin 6148 -> 0 bytes 
.../tests/data/ncrna/agp/inputs/in.json | 27 - .../tests/data/ncrna/agp/manifest.txt | 1 - .../data/ncrna/agp/output/test.features.json | 27 - .../tests/data/ncrna/agp/test.agp | 1 - .../tests/data/ncrna/custom_regex/.DS_Store | Bin 6148 -> 0 bytes .../data/ncrna/custom_regex/inputs/a.json | 27 - .../data/ncrna/custom_regex/inputs/b.json | 27 - .../data/ncrna/custom_regex/manifest.txt | 2 - .../custom_regex/output/test.features.json | 37 -- .../tests/data/ncrna/order/.DS_Store | Bin 6148 -> 0 bytes .../tests/data/ncrna/order/inputs/01.json | 27 - .../tests/data/ncrna/order/inputs/02.json | 27 - .../tests/data/ncrna/order/manifest.txt | 2 - .../ncrna/order/output/test.features.json | 37 -- .../tests/data/ncrna/seq_region/.DS_Store | Bin 6148 -> 0 bytes .../tests/data/ncrna/seq_region/inputs/a.json | 27 - .../tests/data/ncrna/seq_region/inputs/b.json | 27 - .../tests/data/ncrna/seq_region/manifest.txt | 2 - .../seq_region/output/test.features.json | 37 -- .../tests/data/repeat/agp/inputs/in.json | 34 -- .../tests/data/repeat/agp/manifest.txt | 1 - .../data/repeat/agp/output/test.features.json | 34 -- .../tests/data/repeat/agp/test.agp | 1 - .../data/repeat/custom_regex/inputs/in.json | 34 -- .../data/repeat/custom_regex/manifest.txt | 1 - .../custom_regex/output/test.features.json | 34 -- .../tests/data/repeat/order/inputs/01.json | 34 -- .../tests/data/repeat/order/inputs/02.json | 34 -- .../tests/data/repeat/order/manifest.txt | 2 - .../repeat/order/output/test.features.json | 43 -- .../data/repeat/seq_region/inputs/a.json | 34 -- .../data/repeat/seq_region/inputs/b.json | 34 -- .../tests/data/repeat/seq_region/manifest.txt | 2 - .../seq_region/output/test.features.json | 43 -- .../features/combine_json/tests/main.nf.test | 221 ++------ .../combine_json/tests/main.nf.test.snap | 522 +----------------- 70 files changed, 227 insertions(+), 1851 deletions(-) delete mode 100644 modules/ensembl/fasta/recombine/tests/data/agp/inputs/part1.fa delete mode 100644 
modules/ensembl/fasta/recombine/tests/data/agp/inputs/part2.fa delete mode 100644 modules/ensembl/fasta/recombine/tests/data/agp/manifest.txt delete mode 100644 modules/ensembl/fasta/recombine/tests/data/agp/output/test.fa delete mode 100644 modules/ensembl/fasta/recombine/tests/data/agp/test.agp delete mode 100644 modules/ensembl/fasta/recombine/tests/data/custom_regex/inputs/seq1_1.fa delete mode 100644 modules/ensembl/fasta/recombine/tests/data/custom_regex/inputs/seq1_5.fa delete mode 100644 modules/ensembl/fasta/recombine/tests/data/custom_regex/manifest.txt delete mode 100644 modules/ensembl/fasta/recombine/tests/data/custom_regex/output/test.fa delete mode 100644 modules/ensembl/fasta/recombine/tests/data/header/inputs/seq1_chunk_start_1.fa delete mode 100644 modules/ensembl/fasta/recombine/tests/data/header/inputs/seq1_chunk_start_5.fa delete mode 100644 modules/ensembl/fasta/recombine/tests/data/header/inputs/seq2.fa delete mode 100644 modules/ensembl/fasta/recombine/tests/data/header/manifest.txt delete mode 100644 modules/ensembl/fasta/recombine/tests/data/header/output/test.fa delete mode 100644 modules/ensembl/fasta/recombine/tests/data/order/inputs/01_second.fa delete mode 100644 modules/ensembl/fasta/recombine/tests/data/order/inputs/02_first.fa delete mode 100644 modules/ensembl/fasta/recombine/tests/data/order/manifest.txt delete mode 100644 modules/ensembl/fasta/recombine/tests/data/order/output/test.fa delete mode 100644 modules/ensembl/fasta/split/tests/data/agp/test.agp delete mode 100644 modules/ensembl/fasta/split/tests/data/real/in.fa delete mode 100644 modules/ensembl/fasta/split/tests/data/splits/default/0/test.1.fa delete mode 100644 modules/ensembl/fasta/split/tests/data/splits/default/0/test.2.fa delete mode 100644 modules/ensembl/fasta/split/tests/data/splits/multi_dir/0/0/test.1.fa delete mode 100644 modules/ensembl/fasta/split/tests/data/splits/multi_dir/0/1/test.2.fa delete mode 100644 
modules/ensembl/fasta/split/tests/data/splits/unique/0/test.0.1.fa delete mode 100644 modules/ensembl/fasta/split/tests/data/splits/unique/0/test.0.2.fa delete mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/agp/.DS_Store delete mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/agp/inputs/in.json delete mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/agp/manifest.txt delete mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/agp/output/test.features.json delete mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/agp/test.agp delete mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/.DS_Store delete mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/inputs/a.json delete mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/inputs/b.json delete mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/manifest.txt delete mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/output/test.features.json delete mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/order/.DS_Store delete mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/order/inputs/01.json delete mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/order/inputs/02.json delete mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/order/manifest.txt delete mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/order/output/test.features.json delete mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/seq_region/.DS_Store delete mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/seq_region/inputs/a.json delete mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/seq_region/inputs/b.json delete mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/seq_region/manifest.txt 
delete mode 100644 modules/ensembl/features/combine_json/tests/data/ncrna/seq_region/output/test.features.json delete mode 100644 modules/ensembl/features/combine_json/tests/data/repeat/agp/inputs/in.json delete mode 100644 modules/ensembl/features/combine_json/tests/data/repeat/agp/manifest.txt delete mode 100644 modules/ensembl/features/combine_json/tests/data/repeat/agp/output/test.features.json delete mode 100644 modules/ensembl/features/combine_json/tests/data/repeat/agp/test.agp delete mode 100644 modules/ensembl/features/combine_json/tests/data/repeat/custom_regex/inputs/in.json delete mode 100644 modules/ensembl/features/combine_json/tests/data/repeat/custom_regex/manifest.txt delete mode 100644 modules/ensembl/features/combine_json/tests/data/repeat/custom_regex/output/test.features.json delete mode 100644 modules/ensembl/features/combine_json/tests/data/repeat/order/inputs/01.json delete mode 100644 modules/ensembl/features/combine_json/tests/data/repeat/order/inputs/02.json delete mode 100644 modules/ensembl/features/combine_json/tests/data/repeat/order/manifest.txt delete mode 100644 modules/ensembl/features/combine_json/tests/data/repeat/order/output/test.features.json delete mode 100644 modules/ensembl/features/combine_json/tests/data/repeat/seq_region/inputs/a.json delete mode 100644 modules/ensembl/features/combine_json/tests/data/repeat/seq_region/inputs/b.json delete mode 100644 modules/ensembl/features/combine_json/tests/data/repeat/seq_region/manifest.txt delete mode 100644 modules/ensembl/features/combine_json/tests/data/repeat/seq_region/output/test.features.json diff --git a/modules/ensembl/fasta/recombine/main.nf b/modules/ensembl/fasta/recombine/main.nf index 01b53dc..057c98f 100644 --- a/modules/ensembl/fasta/recombine/main.nf +++ b/modules/ensembl/fasta/recombine/main.nf @@ -55,7 +55,7 @@ process FASTA_RECOMBINE { cat <<-END_VERSIONS > versions.yml ${task.process}: - fasta_recombine: $(fasta_recombine --version 2>/dev/null | head -n 1) + 
fasta_recombine: \$(fasta_recombine --version 2>/dev/null | head -n 1) END_VERSIONS """ @@ -63,26 +63,12 @@ process FASTA_RECOMBINE { """ set -euo pipefail - test_data_dir="${moduleDir}/tests/data" + out_fa="${meta.id}.fa" + touch "\$out_fa" - out_fasta="${meta.id}.fa" - - test -s "${fasta_manifest}" - - mode="header" - agp_path="${agp}" - agp_name="\${agp_path##*/}" - if [[ "\$agp_name" != "NO_FILE" ]]; then - mode="agp" - fi - - - cp "\$test_data_dir/\$mode/output/${meta.id}.fa" "\$out_fasta" - cat <<-END_VERSIONS > versions.yml ${task.process}: - fasta_recombine: $(fasta_recombine --version 2>/dev/null | head -n 1) + fasta_recombine: stub END_VERSIONS - """ - + """ } diff --git a/modules/ensembl/fasta/recombine/tests/data/agp/inputs/part1.fa b/modules/ensembl/fasta/recombine/tests/data/agp/inputs/part1.fa deleted file mode 100644 index dafb755..0000000 --- a/modules/ensembl/fasta/recombine/tests/data/agp/inputs/part1.fa +++ /dev/null @@ -1,2 +0,0 @@ ->part1 -AAAAAA \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/agp/inputs/part2.fa b/modules/ensembl/fasta/recombine/tests/data/agp/inputs/part2.fa deleted file mode 100644 index 0fc377e..0000000 --- a/modules/ensembl/fasta/recombine/tests/data/agp/inputs/part2.fa +++ /dev/null @@ -1,2 +0,0 @@ ->part2 -CCCCCC \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/agp/manifest.txt b/modules/ensembl/fasta/recombine/tests/data/agp/manifest.txt deleted file mode 100644 index b128cbe..0000000 --- a/modules/ensembl/fasta/recombine/tests/data/agp/manifest.txt +++ /dev/null @@ -1,2 +0,0 @@ -inputs/part1.fa -inputs/part2.fa \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/agp/output/test.fa b/modules/ensembl/fasta/recombine/tests/data/agp/output/test.fa deleted file mode 100644 index b53532e..0000000 --- a/modules/ensembl/fasta/recombine/tests/data/agp/output/test.fa +++ /dev/null @@ -1,2 +0,0 @@ ->seq1 -AAAAAACCCCCC \ No 
newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/agp/test.agp b/modules/ensembl/fasta/recombine/tests/data/agp/test.agp deleted file mode 100644 index a73c8db..0000000 --- a/modules/ensembl/fasta/recombine/tests/data/agp/test.agp +++ /dev/null @@ -1,3 +0,0 @@ -##agp-version 2.0 -seq1 1 6 1 W part1 1 6 + -seq1 7 12 2 W part2 1 6 + \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/custom_regex/inputs/seq1_1.fa b/modules/ensembl/fasta/recombine/tests/data/custom_regex/inputs/seq1_1.fa deleted file mode 100644 index 0af2767..0000000 --- a/modules/ensembl/fasta/recombine/tests/data/custom_regex/inputs/seq1_1.fa +++ /dev/null @@ -1,2 +0,0 @@ ->seqY_1 -CCCC \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/custom_regex/inputs/seq1_5.fa b/modules/ensembl/fasta/recombine/tests/data/custom_regex/inputs/seq1_5.fa deleted file mode 100644 index c722026..0000000 --- a/modules/ensembl/fasta/recombine/tests/data/custom_regex/inputs/seq1_5.fa +++ /dev/null @@ -1,2 +0,0 @@ ->seqY_5 -GGGG \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/custom_regex/manifest.txt b/modules/ensembl/fasta/recombine/tests/data/custom_regex/manifest.txt deleted file mode 100644 index a125950..0000000 --- a/modules/ensembl/fasta/recombine/tests/data/custom_regex/manifest.txt +++ /dev/null @@ -1,2 +0,0 @@ -inputs/seq1_1.fa -inputs/seq1_5.fa \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/custom_regex/output/test.fa b/modules/ensembl/fasta/recombine/tests/data/custom_regex/output/test.fa deleted file mode 100644 index 46d11a6..0000000 --- a/modules/ensembl/fasta/recombine/tests/data/custom_regex/output/test.fa +++ /dev/null @@ -1,2 +0,0 @@ ->seq1 -CCCCGGGG \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/header/inputs/seq1_chunk_start_1.fa 
b/modules/ensembl/fasta/recombine/tests/data/header/inputs/seq1_chunk_start_1.fa deleted file mode 100644 index 17d88e1..0000000 --- a/modules/ensembl/fasta/recombine/tests/data/header/inputs/seq1_chunk_start_1.fa +++ /dev/null @@ -1,2 +0,0 @@ ->seq1_chunk_start_1 -AAAA \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/header/inputs/seq1_chunk_start_5.fa b/modules/ensembl/fasta/recombine/tests/data/header/inputs/seq1_chunk_start_5.fa deleted file mode 100644 index b6646f2..0000000 --- a/modules/ensembl/fasta/recombine/tests/data/header/inputs/seq1_chunk_start_5.fa +++ /dev/null @@ -1,2 +0,0 @@ ->seq1_chunk_start_5 -CCCC \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/header/inputs/seq2.fa b/modules/ensembl/fasta/recombine/tests/data/header/inputs/seq2.fa deleted file mode 100644 index 70d86fb..0000000 --- a/modules/ensembl/fasta/recombine/tests/data/header/inputs/seq2.fa +++ /dev/null @@ -1,2 +0,0 @@ ->seq2 -GGGGTT \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/header/manifest.txt b/modules/ensembl/fasta/recombine/tests/data/header/manifest.txt deleted file mode 100644 index a34084d..0000000 --- a/modules/ensembl/fasta/recombine/tests/data/header/manifest.txt +++ /dev/null @@ -1,3 +0,0 @@ -inputs/seq1_chunk_start_1.fa -inputs/seq1_chunk_start_5.fa -inputs/seq2.fa \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/header/output/test.fa b/modules/ensembl/fasta/recombine/tests/data/header/output/test.fa deleted file mode 100644 index d3bbb3d..0000000 --- a/modules/ensembl/fasta/recombine/tests/data/header/output/test.fa +++ /dev/null @@ -1,4 +0,0 @@ ->seq1 -AAAACCCC ->seq2 -GGGGTT \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/order/inputs/01_second.fa b/modules/ensembl/fasta/recombine/tests/data/order/inputs/01_second.fa deleted file mode 100644 index d06c158..0000000 --- 
a/modules/ensembl/fasta/recombine/tests/data/order/inputs/01_second.fa +++ /dev/null @@ -1,2 +0,0 @@ ->second second_record -TTTT \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/order/inputs/02_first.fa b/modules/ensembl/fasta/recombine/tests/data/order/inputs/02_first.fa deleted file mode 100644 index 1e20e1f..0000000 --- a/modules/ensembl/fasta/recombine/tests/data/order/inputs/02_first.fa +++ /dev/null @@ -1,2 +0,0 @@ ->first first_record -AAAA \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/order/manifest.txt b/modules/ensembl/fasta/recombine/tests/data/order/manifest.txt deleted file mode 100644 index dae8a10..0000000 --- a/modules/ensembl/fasta/recombine/tests/data/order/manifest.txt +++ /dev/null @@ -1,2 +0,0 @@ -inputs/02_first.fa -inputs/01_second.fa \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/data/order/output/test.fa b/modules/ensembl/fasta/recombine/tests/data/order/output/test.fa deleted file mode 100644 index b3b6d1e..0000000 --- a/modules/ensembl/fasta/recombine/tests/data/order/output/test.fa +++ /dev/null @@ -1,4 +0,0 @@ ->first first_record -AAAA ->second second_record -TTTT diff --git a/modules/ensembl/fasta/recombine/tests/main.nf.test b/modules/ensembl/fasta/recombine/tests/main.nf.test index 91844ba..9a7a6c9 100644 --- a/modules/ensembl/fasta/recombine/tests/main.nf.test +++ b/modules/ensembl/fasta/recombine/tests/main.nf.test @@ -25,7 +25,6 @@ nextflow_process { tag "fasta" tag "fasta/recombine" - test("Stub outputs: header mode") { when { @@ -33,10 +32,17 @@ nextflow_process { process { """ - input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/header/manifest.txt'), - file('${workflow.projectDir}/modules/assets/NO_FILE')] + def manifest = file("manifest.txt") + manifest.text = "x\\n" + + def no_file = file("NO_FILE") + no_file.text = "" + input[0] = [ + [ id: 'test' ], + manifest, + no_file + ] """ } } @@ -49,7 +55,6 @@ 
nextflow_process { } } - test("Stub outputs: AGP mode") { when { @@ -57,51 +62,16 @@ nextflow_process { process { """ - input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/agp/manifest.txt'), - file('${moduleDir}/tests/data/agp/test.agp')] - """ - } - } - - then { - assert process.trace.tasks().size() == 1 - assert process.out.recombined_fasta.size() == 1 - assert process.success - assert snapshot(process.out).match() - } - } - - - test("Real run: header recombination") { - - when { - process { - """ - input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/header/manifest.txt'), - file('${workflow.projectDir}/modules/assets/NO_FILE')] - """ - } - } - - then { - assert process.trace.tasks().size() == 1 - assert process.out.recombined_fasta.size() == 1 - assert process.success - assert snapshot(process.out).match() - } - } - - - test("Real run: AGP recombination") { - - when { - process { - """ - input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/agp/manifest.txt'), - file('${moduleDir}/tests/data/agp/test.agp')] + def manifest = file("manifest.txt") + manifest.text = "x\\n" + + def agp = file("test.agp") + agp.text = "" + input[0] = [ + [ id: 'test' ], + manifest, + agp + ] """ } } @@ -113,49 +83,4 @@ nextflow_process { assert snapshot(process.out).match() } } - - - test("Real run: custom chunk regex") { - - when { - params.chunk_id_regex = '^(?P.+)_(?P\\d+)$' - - process { - """ - input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/custom_regex/manifest.txt'), - file('${workflow.projectDir}/modules/assets/NO_FILE')] - """ - } - } - - then { - assert process.trace.tasks().size() == 1 - assert process.out.recombined_fasta.size() == 1 - assert process.success - assert snapshot(process.out).match() - } - } - - - test("Real run: manifest order is preserved") { - - when { - process { - """ - input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/order/manifest.txt'), - file('${workflow.projectDir}/modules/assets/NO_FILE')] - """ - } - } - - 
then { - assert process.trace.tasks().size() == 1 - assert process.out.recombined_fasta.size() == 1 - assert process.success - assert snapshot(process.out).match() - } - } -} - +} \ No newline at end of file diff --git a/modules/ensembl/fasta/recombine/tests/main.nf.test.snap b/modules/ensembl/fasta/recombine/tests/main.nf.test.snap index 3a27deb..3c98f07 100644 --- a/modules/ensembl/fasta/recombine/tests/main.nf.test.snap +++ b/modules/ensembl/fasta/recombine/tests/main.nf.test.snap @@ -7,74 +7,26 @@ { "id": "test" }, - "test.fa:md5,3ec81eef9dd73dc86ff01621dbacc7a0" + "test.fa:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], - "recombined_fasta": [ - [ - { - "id": "test" - }, - "test.fa:md5,3ec81eef9dd73dc86ff01621dbacc7a0" - ] - ] - } - ], - "timestamp": "2026-02-18T23:12:05.089688", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.3" - } - }, - "Real run: AGP recombination": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.fa:md5,f32bc79faea4bc05dd4675e0d4ededa1" - ] + "1": [ + "versions.yml:md5,191cc20355b504364a619df6b4c639aa" ], "recombined_fasta": [ [ { "id": "test" }, - "test.fa:md5,f32bc79faea4bc05dd4675e0d4ededa1" - ] - ] - } - ], - "timestamp": "2026-02-18T23:12:09.601838", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.3" - } - }, - "Real run: header recombination": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.fa:md5,700550164316730d1145b7bde2ae3eb7" + "test.fa:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], - "recombined_fasta": [ - [ - { - "id": "test" - }, - "test.fa:md5,700550164316730d1145b7bde2ae3eb7" - ] + "versions": [ + "versions.yml:md5,191cc20355b504364a619df6b4c639aa" ] } ], - "timestamp": "2026-02-18T23:12:07.342405", + "timestamp": "2026-03-11T12:20:11.373089", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" @@ -88,74 +40,26 @@ { "id": "test" }, - "test.fa:md5,93d1870d020e197708753501e57db68f" + "test.fa:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], - "recombined_fasta": [ - [ - { - "id": "test" - 
}, - "test.fa:md5,93d1870d020e197708753501e57db68f" - ] - ] - } - ], - "timestamp": "2026-02-18T23:12:03.015143", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.3" - } - }, - "Real run: custom chunk regex": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.fa:md5,a589b60028be69f01622a61cc78fa1ae" - ] + "1": [ + "versions.yml:md5,191cc20355b504364a619df6b4c639aa" ], "recombined_fasta": [ [ { "id": "test" }, - "test.fa:md5,a589b60028be69f01622a61cc78fa1ae" - ] - ] - } - ], - "timestamp": "2026-02-18T23:12:11.852053", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.3" - } - }, - "Real run: manifest order is preserved": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.fa:md5,52fa2054da674f0a5ebc263e724cf4a4" + "test.fa:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], - "recombined_fasta": [ - [ - { - "id": "test" - }, - "test.fa:md5,52fa2054da674f0a5ebc263e724cf4a4" - ] + "versions": [ + "versions.yml:md5,191cc20355b504364a619df6b4c639aa" ] } ], - "timestamp": "2026-02-18T23:12:14.083842", + "timestamp": "2026-03-11T12:20:09.308095", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" diff --git a/modules/ensembl/fasta/split/main.nf b/modules/ensembl/fasta/split/main.nf index 8cc900b..4a33e00 100644 --- a/modules/ensembl/fasta/split/main.nf +++ b/modules/ensembl/fasta/split/main.nf @@ -13,15 +13,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-def fasta_split_mem(longest_seq_bp) { - if( !longest_seq_bp || longest_seq_bp <= 0 ) return 8.GB - - // Heuristic: ~2.5 bytes/base peak => ~1 GB per 400 Mbp of the *longest* sequence - // Add 2GB base memory to account for overhead - def mem_gb = 2 + Math.ceil(longest_seq_bp as double / 400_000_000d) - return mem_gb.GB -} - process FASTA_SPLIT { tag "${meta.id}" @@ -87,7 +78,7 @@ process FASTA_SPLIT { cat <<-END_VERSIONS > versions.yml ${task.process}: - fasta_split: $(fasta_split --version 2>/dev/null | head -n 1) + fasta_split: \$(fasta_split --version 2>/dev/null | head -n 1) END_VERSIONS """ @@ -95,8 +86,6 @@ process FASTA_SPLIT { """ set -euo pipefail - test_data_dir="${moduleDir}/tests/data" - layout="default" if [[ "${params.unique_file_names ?: false}" == "true" ]]; then layout="unique" @@ -105,17 +94,42 @@ process FASTA_SPLIT { fi mkdir -p splits - cp -R "\$test_data_dir/splits/\$layout/." "splits/" + + if [[ "\$layout" == "default" ]]; then + mkdir -p splits/0 + touch splits/0/test.1.fa + touch splits/0/test.2.fa + + elif [[ "\$layout" == "unique" ]]; then + mkdir -p splits/0 + touch splits/0/test.0.1.fa + touch splits/0/test.0.2.fa + + elif [[ "\$layout" == "multi_dir" ]]; then + mkdir -p splits/0/0 + mkdir -p splits/0/1 + touch splits/0/0/test.1.fa + touch splits/0/1/test.2.fa + fi if [[ "${params.write_agp ?: false}" == "true" ]]; then - cp "\$test_data_dir/agp/test.agp" "splits/${meta.id}.agp" + touch "splits/${meta.id}.agp" fi cat <<-END_VERSIONS > versions.yml ${task.process}: - fasta_split: $(fasta_split --version 2>/dev/null | head -n 1) + fasta_split: stub END_VERSIONS - """ + """ +} + + +def fasta_split_mem(longest_seq_bp) { + if( !longest_seq_bp || longest_seq_bp <= 0 ) return 8.GB - + // Heuristic: ~2.5 bytes/base peak => ~1 GB per 400 Mbp of the *longest* sequence + // Add 2GB base memory to account for overhead + def mem_gb = 2 + Math.ceil(longest_seq_bp as double / 400_000_000d) + return mem_gb.GB } + diff --git 
a/modules/ensembl/fasta/split/tests/data/agp/test.agp b/modules/ensembl/fasta/split/tests/data/agp/test.agp deleted file mode 100644 index 46fc419..0000000 --- a/modules/ensembl/fasta/split/tests/data/agp/test.agp +++ /dev/null @@ -1,4 +0,0 @@ -# AGP-version 2.0 -seq1 1 10 1 W seq1 1 10 + -seq2 1 10 1 W seq2 1 10 + -seq3 1 11 1 W seq3 1 11 + \ No newline at end of file diff --git a/modules/ensembl/fasta/split/tests/data/real/in.fa b/modules/ensembl/fasta/split/tests/data/real/in.fa deleted file mode 100644 index 3d3f65c..0000000 --- a/modules/ensembl/fasta/split/tests/data/real/in.fa +++ /dev/null @@ -1,6 +0,0 @@ ->seq1 -AAAAAAAAAA ->seq2 -CCCCCCCCCC ->seq3 -GGGGGGGGGGG \ No newline at end of file diff --git a/modules/ensembl/fasta/split/tests/data/splits/default/0/test.1.fa b/modules/ensembl/fasta/split/tests/data/splits/default/0/test.1.fa deleted file mode 100644 index 7abe938..0000000 --- a/modules/ensembl/fasta/split/tests/data/splits/default/0/test.1.fa +++ /dev/null @@ -1,4 +0,0 @@ ->seq1 -AAAAAAAAAA ->seq2 -CCCCCCCCCC \ No newline at end of file diff --git a/modules/ensembl/fasta/split/tests/data/splits/default/0/test.2.fa b/modules/ensembl/fasta/split/tests/data/splits/default/0/test.2.fa deleted file mode 100644 index 6287efa..0000000 --- a/modules/ensembl/fasta/split/tests/data/splits/default/0/test.2.fa +++ /dev/null @@ -1,2 +0,0 @@ ->seq3 -GGGGGGGGGGG \ No newline at end of file diff --git a/modules/ensembl/fasta/split/tests/data/splits/multi_dir/0/0/test.1.fa b/modules/ensembl/fasta/split/tests/data/splits/multi_dir/0/0/test.1.fa deleted file mode 100644 index 9512f36..0000000 --- a/modules/ensembl/fasta/split/tests/data/splits/multi_dir/0/0/test.1.fa +++ /dev/null @@ -1,2 +0,0 @@ ->seq1 -AAAAAAAAAA \ No newline at end of file diff --git a/modules/ensembl/fasta/split/tests/data/splits/multi_dir/0/1/test.2.fa b/modules/ensembl/fasta/split/tests/data/splits/multi_dir/0/1/test.2.fa deleted file mode 100644 index 2f3b40f..0000000 --- 
a/modules/ensembl/fasta/split/tests/data/splits/multi_dir/0/1/test.2.fa +++ /dev/null @@ -1,2 +0,0 @@ ->seq2 -CCCCCCCCCC \ No newline at end of file diff --git a/modules/ensembl/fasta/split/tests/data/splits/unique/0/test.0.1.fa b/modules/ensembl/fasta/split/tests/data/splits/unique/0/test.0.1.fa deleted file mode 100644 index 9512f36..0000000 --- a/modules/ensembl/fasta/split/tests/data/splits/unique/0/test.0.1.fa +++ /dev/null @@ -1,2 +0,0 @@ ->seq1 -AAAAAAAAAA \ No newline at end of file diff --git a/modules/ensembl/fasta/split/tests/data/splits/unique/0/test.0.2.fa b/modules/ensembl/fasta/split/tests/data/splits/unique/0/test.0.2.fa deleted file mode 100644 index 2f3b40f..0000000 --- a/modules/ensembl/fasta/split/tests/data/splits/unique/0/test.0.2.fa +++ /dev/null @@ -1,2 +0,0 @@ ->seq2 -CCCCCCCCCC \ No newline at end of file diff --git a/modules/ensembl/fasta/split/tests/main.nf.test b/modules/ensembl/fasta/split/tests/main.nf.test index 37211ae..5aa3acf 100644 --- a/modules/ensembl/fasta/split/tests/main.nf.test +++ b/modules/ensembl/fasta/split/tests/main.nf.test @@ -26,14 +26,11 @@ nextflow_process { tag "fasta/split" - def real_fa = new File("modules/ensembl/fasta/split/tests/data/real/in.fa").canonicalFile - test("Stub outputs: default layout, no AGP") { when { options "-stub" - // Ensure params are set explicitly for this test params.write_agp = false params.unique_file_names = false params.max_files_per_directory = null @@ -49,7 +46,6 @@ nextflow_process { then { assert snapshot(process.out).match() - // fasta: tuple(meta, fa_paths) assert process.out.fastas != null assert process.out.fastas.size() == 1 @@ -60,18 +56,11 @@ nextflow_process { assert meta.id == "test" assert fas != null assert fas.size() == 2 + assert fas.collect { path(it).toFile().name }.sort() == ["test.1.fa", "test.2.fa"] - // agp: tuple(meta, agp_paths) optional -> should be absent assert process.out.agp != null assert process.out.agp.size() == 0 - // Ensure FASTA parsing works 
(downstream contract) - def merged = fas - .collect { path(it).fasta } - .inject([:]) { acc, m -> acc + m } - - assert merged.keySet().containsAll(["seq1", "seq2", "seq3"]) - assertAll( { assert process.success } ) @@ -114,12 +103,6 @@ nextflow_process { assert agp_paths.size() == 1 assert agp_file.name == "test.agp" - def agp_text = agp_file.text - assert agp_text.startsWith("# AGP-version 2.0") - assert agp_text.contains("seq1\t1\t10\t1\tW\tseq1\t1\t10\t+") - assert agp_text.contains("seq2\t1\t10\t1\tW\tseq2\t1\t10\t+") - assert agp_text.contains("seq3\t1\t11\t1\tW\tseq3\t1\t11\t+") - assertAll( { assert process.success } ) @@ -196,121 +179,4 @@ nextflow_process { ) } } - - test("Real run: default behaviour produces FASTAs and no AGP") { - - when { - params.write_agp = false - params.unique_file_names = false - params.max_seqs_per_file = null - params.max_seq_length_per_file = null - params.max_files_per_directory = null - params.max_dirs_per_directory = null - params.force_max_seq_length = false - - process { - """ - input[0] = [[ id:'test' ], file('${real_fa.absolutePath}')] - """ - } - } - - then { - assert process.success - - assert process.out.fastas != null - assert process.out.fastas.size() == 1 - - def out = process.out.fastas[0] - def meta = out[0] - def fas = out[1] - - assert meta.id == "test" - def fas_list = (fas instanceof List) ? 
fas : [fas] - assert fas_list.size() >= 1 - - assert process.out.agp != null - assert process.out.agp.size() == 0 - - def merged = fas_list - .collect { path(it).fasta } - .inject([:]) { acc, m -> acc + m } - - assert merged.keySet().containsAll(["seq1", "seq2", "seq3"]) - } - } - - test("Real run: write_agp=true emits exactly one AGP file") { - - when { - params.write_agp = true - params.unique_file_names = false - params.max_files_per_directory = null - params.max_dirs_per_directory = null - params.max_seqs_per_file = null - params.max_seq_length_per_file = null - params.force_max_seq_length = false - - process { - """ - input[0] = [[ id:'test' ], file('${real_fa.absolutePath}')] - """ - } - } - - then { - assert process.success - - assert process.out.agp != null - assert process.out.agp.size() == 1 - - def agp_out = process.out.agp[0] - def agp_meta = agp_out[0] - def agp_val = agp_out[1] - - assert agp_meta.id == "test" - - def agp_list = (agp_val instanceof List) ? agp_val : [agp_val] - assert agp_list.size() == 1 - - def agp_path = path(agp_list[0]) - assert agp_path.fileName.toString().endsWith(".agp") - - def agp_text = agp_path.toFile().text - assert agp_text.startsWith("# AGP-version 2.0") - assert agp_text.contains("seq1\t1\t10\t1\tW\tseq1\t1\t10\t+") - assert agp_text.contains("seq2\t1\t10\t1\tW\tseq2\t1\t10\t+") - assert agp_text.contains("seq3\t1\t11\t1\tW\tseq3\t1\t11\t+") - } - } - - test("Real run: max_seqs_per_file=2 splits into 2 FASTA outputs") { - - when { - params.write_agp = false - params.max_seqs_per_file = 2 - params.unique_file_names = false - params.max_files_per_directory = null - params.max_dirs_per_directory = null - - process { - """ - input[0] = [[ id:'test' ], file('${real_fa.absolutePath}')] - """ - } - } - - then { - assert process.success - - def fas = process.out.fastas[0][1] - assert fas.size() == 2 - - def merged = fas - .collect { path(it).fasta } - .inject([:]) { acc, m -> acc + m } - - assert 
merged.keySet().containsAll(["seq1", "seq2", "seq3"]) - } - } } diff --git a/modules/ensembl/fasta/split/tests/main.nf.test.snap b/modules/ensembl/fasta/split/tests/main.nf.test.snap index eb12321..d736a2a 100644 --- a/modules/ensembl/fasta/split/tests/main.nf.test.snap +++ b/modules/ensembl/fasta/split/tests/main.nf.test.snap @@ -8,8 +8,8 @@ "id": "test" }, [ - "test.1.fa:md5,336490c5e8c624cb1ae29048f28f0978", - "test.2.fa:md5,55d5ca305356033516f7ae1b5ecca900" + "test.1.fa:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.2.fa:md5,d41d8cd98f00b204e9800998ecf8427e" ] ] ], @@ -18,15 +18,18 @@ { "id": "test" }, - "test.agp:md5,c12ac51bd2b1ca95cdd8f011eca0cd1c" + "test.agp:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], + "2": [ + "versions.yml:md5,f9829a6851db178766a8ce7426f53a65" + ], "agp": [ [ { "id": "test" }, - "test.agp:md5,c12ac51bd2b1ca95cdd8f011eca0cd1c" + "test.agp:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], "fastas": [ @@ -35,14 +38,17 @@ "id": "test" }, [ - "test.1.fa:md5,336490c5e8c624cb1ae29048f28f0978", - "test.2.fa:md5,55d5ca305356033516f7ae1b5ecca900" + "test.1.fa:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.2.fa:md5,d41d8cd98f00b204e9800998ecf8427e" ] ] + ], + "versions": [ + "versions.yml:md5,f9829a6851db178766a8ce7426f53a65" ] } ], - "timestamp": "2026-02-18T23:21:51.036982", + "timestamp": "2026-03-11T12:20:33.334793", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" @@ -57,13 +63,16 @@ "id": "test" }, [ - "test.1.fa:md5,41e176f082cc04841e50d8aa5c4f4d5a", - "test.2.fa:md5,e3bd0305f6466c13a1479c6b82391e6f" + "test.1.fa:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.2.fa:md5,d41d8cd98f00b204e9800998ecf8427e" ] ] ], "1": [ + ], + "2": [ + "versions.yml:md5,f9829a6851db178766a8ce7426f53a65" ], "agp": [ @@ -74,14 +83,17 @@ "id": "test" }, [ - "test.1.fa:md5,41e176f082cc04841e50d8aa5c4f4d5a", - "test.2.fa:md5,e3bd0305f6466c13a1479c6b82391e6f" + "test.1.fa:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.2.fa:md5,d41d8cd98f00b204e9800998ecf8427e" ] ] + ], + 
"versions": [ + "versions.yml:md5,f9829a6851db178766a8ce7426f53a65" ] } ], - "timestamp": "2026-02-18T23:06:24.284416", + "timestamp": "2026-03-11T12:20:37.504172", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" @@ -96,13 +108,16 @@ "id": "test" }, [ - "test.1.fa:md5,336490c5e8c624cb1ae29048f28f0978", - "test.2.fa:md5,55d5ca305356033516f7ae1b5ecca900" + "test.1.fa:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.2.fa:md5,d41d8cd98f00b204e9800998ecf8427e" ] ] ], "1": [ + ], + "2": [ + "versions.yml:md5,f9829a6851db178766a8ce7426f53a65" ], "agp": [ @@ -113,14 +128,17 @@ "id": "test" }, [ - "test.1.fa:md5,336490c5e8c624cb1ae29048f28f0978", - "test.2.fa:md5,55d5ca305356033516f7ae1b5ecca900" + "test.1.fa:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.2.fa:md5,d41d8cd98f00b204e9800998ecf8427e" ] ] + ], + "versions": [ + "versions.yml:md5,f9829a6851db178766a8ce7426f53a65" ] } ], - "timestamp": "2026-02-18T23:06:18.00303", + "timestamp": "2026-03-11T12:20:31.268587", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" @@ -135,13 +153,16 @@ "id": "test" }, [ - "test.0.1.fa:md5,41e176f082cc04841e50d8aa5c4f4d5a", - "test.0.2.fa:md5,e3bd0305f6466c13a1479c6b82391e6f" + "test.0.1.fa:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.0.2.fa:md5,d41d8cd98f00b204e9800998ecf8427e" ] ] ], "1": [ + ], + "2": [ + "versions.yml:md5,f9829a6851db178766a8ce7426f53a65" ], "agp": [ @@ -152,14 +173,17 @@ "id": "test" }, [ - "test.0.1.fa:md5,41e176f082cc04841e50d8aa5c4f4d5a", - "test.0.2.fa:md5,e3bd0305f6466c13a1479c6b82391e6f" + "test.0.1.fa:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.0.2.fa:md5,d41d8cd98f00b204e9800998ecf8427e" ] ] + ], + "versions": [ + "versions.yml:md5,f9829a6851db178766a8ce7426f53a65" ] } ], - "timestamp": "2026-02-18T23:06:22.194395", + "timestamp": "2026-03-11T12:20:35.403767", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" diff --git a/modules/ensembl/features/combine_json/main.nf b/modules/ensembl/features/combine_json/main.nf index 6d92340..f6dd127 100644 --- 
a/modules/ensembl/features/combine_json/main.nf +++ b/modules/ensembl/features/combine_json/main.nf @@ -55,7 +55,7 @@ process FEATURES_COMBINE_JSON { cat <<-END_VERSIONS > versions.yml ${task.process}: - features_combine_json: $(features_combine_json --version 2>/dev/null | head -n 1) + features_combine_json: \$(features_combine_json --version 2>/dev/null | head -n 1) END_VERSIONS """ @@ -63,18 +63,12 @@ process FEATURES_COMBINE_JSON { """ set -euo pipefail - test_data_dir="${moduleDir}/tests/data" - out_json="${meta.id}.features.json" test -s "${json_manifest}" - mode="seq_region" agp_path="${agp}" agp_name="\${agp_path##*/}" - if [[ "\$agp_name" != "NO_FILE" ]]; then - mode="agp" - fi manifest_real="\$(python -c 'from pathlib import Path; import sys; print(Path(sys.argv[1]).resolve())' "${json_manifest}")" manifest_dir="\$(dirname "\$manifest_real")" @@ -92,7 +86,6 @@ process FEATURES_COMBINE_JSON { exit 1 fi - if grep -q '"ncrna_features"' "\$first_json"; then load_type="ncrna" elif grep -q '"repeat_features"' "\$first_json"; then @@ -103,25 +96,38 @@ process FEATURES_COMBINE_JSON { exit 1 fi - # Provide a schema-valid combined JSON fixture. - # Fixtures are arranged under: - # tests/data/repeat/seq_region/output/.features.json - # tests/data/repeat/agp/output/.features.json - # tests/data/ncrna/seq_region/output/.features.json - # tests/data/ncrna/agp/output/.features.json - fixture="\$test_data_dir/\$load_type/\$mode/output/${meta.id}.features.json" - - if [[ ! -s "\$fixture" ]]; then - echo "ERROR: missing stub fixture: \$fixture" >&2 - echo "Make sure you created output fixture for meta.id='${meta.id}' under \$load_type/\$mode/output/." 
>&2 - exit 1 + if [[ "\$load_type" == "repeat" ]]; then + cat > "\$out_json" <<-EOF +{ + "analysis": { + "logic_name": "stub_repeat" + }, + "source": { + "source_provider": "stub" + }, + "repeat_consensus": [], + "repeat_features": [] +} +EOF + else + cat > "\$out_json" <<-EOF +{ + "analysis": { + "logic_name": "stub_ncrna" + }, + "source": { + "source_provider": "stub" + }, + "ncrna_tool": "stub", + "ncrna_features": [] + } +EOF fi - cp "\$fixture" "\$out_json" - cat <<-END_VERSIONS > versions.yml ${task.process}: - features_combine_json: $(features_combine_json --version 2>/dev/null | head -n 1) + features_combine_json: stub END_VERSIONS """ + } diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/agp/.DS_Store b/modules/ensembl/features/combine_json/tests/data/ncrna/agp/.DS_Store deleted file mode 100644 index 46ebb6833f86b8f68ba1c38bb8339c81f2e59428..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKQBK1!40Xzciul9hFNr(5>V37D*$i=vk3a~F9-B&0EUc75f+HkP@smoe8q4Y4!h5OA!Ab1aB{LT=E*C&d_!@v zI_zHD$puAgjR9j|$iRg@j=BEdpT7SOgY3x|Fb2+w0XJ%9%@nVcYisdxTx&h_0m{OD lNpT;7j+A2faw)!nioou<0}L6HA}kR55fB=zF$VsWfp4#8P9XpQ diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/agp/inputs/in.json b/modules/ensembl/features/combine_json/tests/data/ncrna/agp/inputs/in.json deleted file mode 100644 index d1701a4..0000000 --- a/modules/ensembl/features/combine_json/tests/data/ncrna/agp/inputs/in.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "analysis": { - "run_date": "2026-02-18T00:00:00Z", - "logic_name": "cmscan", - "display_label": "cmscan", - "description": "cmscan analysis", - "program": "test", - "program_version": "0.0" - }, - "source": { - "source_provider": "prov", - "is_primary": true - }, - "ncrna_tool": "cmscan", - "ncrna_features": [ - { - "seq_region": "comp1", - "seq_region_start": 10, - "seq_region_end": 20, - "seq_region_strand": 1, - "biotype": "miRNA", - "score": 1.0, - "target_name": "MIRTEST", - 
"is_significant": true - } - ] -} diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/agp/manifest.txt b/modules/ensembl/features/combine_json/tests/data/ncrna/agp/manifest.txt deleted file mode 100644 index 1ac93e6..0000000 --- a/modules/ensembl/features/combine_json/tests/data/ncrna/agp/manifest.txt +++ /dev/null @@ -1 +0,0 @@ -inputs/in.json diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/agp/output/test.features.json b/modules/ensembl/features/combine_json/tests/data/ncrna/agp/output/test.features.json deleted file mode 100644 index 3479d90..0000000 --- a/modules/ensembl/features/combine_json/tests/data/ncrna/agp/output/test.features.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "analysis": { - "run_date": "2026-02-18T00:00:00Z", - "logic_name": "cmscan", - "display_label": "cmscan", - "description": "cmscan analysis", - "program": "test", - "program_version": "0.0" - }, - "source": { - "source_provider": "prov", - "is_primary": true - }, - "ncrna_tool": "cmscan", - "ncrna_features": [ - { - "seq_region": "chr1", - "seq_region_start": 109, - "seq_region_end": 119, - "seq_region_strand": 1, - "biotype": "miRNA", - "score": 1.0, - "target_name": "MIRTEST", - "is_significant": true - } - ] -} diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/agp/test.agp b/modules/ensembl/features/combine_json/tests/data/ncrna/agp/test.agp deleted file mode 100644 index 86dddab..0000000 --- a/modules/ensembl/features/combine_json/tests/data/ncrna/agp/test.agp +++ /dev/null @@ -1 +0,0 @@ -chr1 100 199 1 W comp1 1 100 + diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/.DS_Store b/modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/.DS_Store deleted file mode 100644 index fe6726456fbd1a5c01fb943de0bb1315c8847206..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKJ5Iwu5S>XZWRxZ)<%)tnH!zVoK`wwoB!U!UOCcSFkH9Io0Rb*U#&r&+1p7X5D>H5~e^NwdSCi 
z^xvcL)!}$IkI6?TkIo!b{hh}#pMD>>_-f?aM+Xk6^^Z;g%j(YF^}Jw z{e;54gX5&`#D!tB!9Xz3XJBlXQ>p(q$It)$ptuSKf`OA_fU|N@&hbcITL%v(wKhR- pp(5hf8ty~T(NYXwDa99171)z>fQ4gg2n!@O0!l+P!N8v~@C8=rP3-^x diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/inputs/a.json b/modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/inputs/a.json deleted file mode 100644 index 82f7bb1..0000000 --- a/modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/inputs/a.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "analysis": { - "run_date": "2026-02-18T00:00:00Z", - "logic_name": "cmscan", - "display_label": "cmscan", - "description": "cmscan analysis", - "program": "test", - "program_version": "0.0" - }, - "source": { - "source_provider": "prov", - "is_primary": true - }, - "ncrna_tool": "cmscan", - "ncrna_features": [ - { - "seq_region": "chr1_1", - "seq_region_start": 1, - "seq_region_end": 3, - "seq_region_strand": 1, - "biotype": "miRNA", - "score": 1.0, - "target_name": "MIRTEST", - "is_significant": true - } - ] -} diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/inputs/b.json b/modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/inputs/b.json deleted file mode 100644 index d6c2349..0000000 --- a/modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/inputs/b.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "analysis": { - "run_date": "2026-02-18T00:00:00Z", - "logic_name": "cmscan", - "display_label": "cmscan", - "description": "cmscan analysis", - "program": "test", - "program_version": "0.0" - }, - "source": { - "source_provider": "prov", - "is_primary": true - }, - "ncrna_tool": "cmscan", - "ncrna_features": [ - { - "seq_region": "chr1_4", - "seq_region_start": 1, - "seq_region_end": 2, - "seq_region_strand": 1, - "biotype": "miRNA", - "score": 1.0, - "target_name": "MIRTEST", - "is_significant": true - } - ] -} diff --git 
a/modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/manifest.txt b/modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/manifest.txt deleted file mode 100644 index 419c5fd..0000000 --- a/modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/manifest.txt +++ /dev/null @@ -1,2 +0,0 @@ -inputs/a.json -inputs/b.json diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/output/test.features.json b/modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/output/test.features.json deleted file mode 100644 index 995f408..0000000 --- a/modules/ensembl/features/combine_json/tests/data/ncrna/custom_regex/output/test.features.json +++ /dev/null @@ -1,37 +0,0 @@ -{ - "analysis": { - "run_date": "2026-02-18T00:00:00Z", - "logic_name": "cmscan", - "display_label": "cmscan", - "description": "cmscan analysis", - "program": "test", - "program_version": "0.0" - }, - "source": { - "source_provider": "prov", - "is_primary": true - }, - "ncrna_tool": "cmscan", - "ncrna_features": [ - { - "seq_region": "chr1", - "seq_region_start": 1, - "seq_region_end": 3, - "seq_region_strand": 1, - "biotype": "miRNA", - "score": 1.0, - "target_name": "MIRTEST", - "is_significant": true - }, - { - "seq_region": "chr1", - "seq_region_start": 4, - "seq_region_end": 5, - "seq_region_strand": 1, - "biotype": "miRNA", - "score": 1.0, - "target_name": "MIRTEST", - "is_significant": true - } - ] -} diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/order/.DS_Store b/modules/ensembl/features/combine_json/tests/data/ncrna/order/.DS_Store deleted file mode 100644 index 66ff2cf9ee6c96bb1913fed247a844fe61ae8b34..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKJ5Iwu5S>XZWJHsaax41Wz(nQ*xc~~02vUMALHez!C^!Q*;0&}>oQF3b1v@69 zLnxY&X5V^tW<7pudv=M4XWQkJXhK9KRB&{P<_D2+(SeNIB7>}RJkrDZb~?XlHtoPW z4F8bwo^f4n`fUf(Z=KFeQy+Ew>GNtgn8G@65E 
z(tnS}7yIMgJf1!~d35I3G~am~^6BRx-$6()5DWwZN6!Fiwn%wq7;P{R37S8a)C43z${V*F6+4i2+zSWpN5D_nT)q-eFL^)J&a*pN)k#W(HOxz-ataCikus}!J)b>)yl&nzy%=-9KlVX6ga#b$zNM2h<4=1%YL2sZU n;@27OL(tJu3|}e57f==0lXQTEV`m5pBsKy{Lo~s_pEB?TBx_B{ diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/seq_region/inputs/a.json b/modules/ensembl/features/combine_json/tests/data/ncrna/seq_region/inputs/a.json deleted file mode 100644 index 8cee59d..0000000 --- a/modules/ensembl/features/combine_json/tests/data/ncrna/seq_region/inputs/a.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "analysis": { - "run_date": "2026-02-18T00:00:00Z", - "logic_name": "cmscan", - "display_label": "cmscan", - "description": "cmscan analysis", - "program": "test", - "program_version": "0.0" - }, - "source": { - "source_provider": "prov", - "is_primary": true - }, - "ncrna_tool": "cmscan", - "ncrna_features": [ - { - "seq_region": "chr1_chunk_start_1", - "seq_region_start": 1, - "seq_region_end": 3, - "seq_region_strand": 1, - "biotype": "miRNA", - "score": 1.0, - "target_name": "MIRTEST", - "is_significant": true - } - ] -} diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/seq_region/inputs/b.json b/modules/ensembl/features/combine_json/tests/data/ncrna/seq_region/inputs/b.json deleted file mode 100644 index a76e76e..0000000 --- a/modules/ensembl/features/combine_json/tests/data/ncrna/seq_region/inputs/b.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "analysis": { - "run_date": "2026-02-18T00:00:00Z", - "logic_name": "cmscan", - "display_label": "cmscan", - "description": "cmscan analysis", - "program": "test", - "program_version": "0.0" - }, - "source": { - "source_provider": "prov", - "is_primary": true - }, - "ncrna_tool": "cmscan", - "ncrna_features": [ - { - "seq_region": "chr1_chunk_start_4", - "seq_region_start": 1, - "seq_region_end": 2, - "seq_region_strand": 1, - "biotype": "miRNA", - "score": 1.0, - "target_name": "MIRTEST", - "is_significant": true - } - ] -} diff --git 
a/modules/ensembl/features/combine_json/tests/data/ncrna/seq_region/manifest.txt b/modules/ensembl/features/combine_json/tests/data/ncrna/seq_region/manifest.txt deleted file mode 100644 index cada44b..0000000 --- a/modules/ensembl/features/combine_json/tests/data/ncrna/seq_region/manifest.txt +++ /dev/null @@ -1,2 +0,0 @@ -inputs/a.json -inputs/b.json \ No newline at end of file diff --git a/modules/ensembl/features/combine_json/tests/data/ncrna/seq_region/output/test.features.json b/modules/ensembl/features/combine_json/tests/data/ncrna/seq_region/output/test.features.json deleted file mode 100644 index 995f408..0000000 --- a/modules/ensembl/features/combine_json/tests/data/ncrna/seq_region/output/test.features.json +++ /dev/null @@ -1,37 +0,0 @@ -{ - "analysis": { - "run_date": "2026-02-18T00:00:00Z", - "logic_name": "cmscan", - "display_label": "cmscan", - "description": "cmscan analysis", - "program": "test", - "program_version": "0.0" - }, - "source": { - "source_provider": "prov", - "is_primary": true - }, - "ncrna_tool": "cmscan", - "ncrna_features": [ - { - "seq_region": "chr1", - "seq_region_start": 1, - "seq_region_end": 3, - "seq_region_strand": 1, - "biotype": "miRNA", - "score": 1.0, - "target_name": "MIRTEST", - "is_significant": true - }, - { - "seq_region": "chr1", - "seq_region_start": 4, - "seq_region_end": 5, - "seq_region_strand": 1, - "biotype": "miRNA", - "score": 1.0, - "target_name": "MIRTEST", - "is_significant": true - } - ] -} diff --git a/modules/ensembl/features/combine_json/tests/data/repeat/agp/inputs/in.json b/modules/ensembl/features/combine_json/tests/data/repeat/agp/inputs/in.json deleted file mode 100644 index 8228fd3..0000000 --- a/modules/ensembl/features/combine_json/tests/data/repeat/agp/inputs/in.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "analysis": { - "run_date": "2026-02-18T00:00:00Z", - "logic_name": "rm", - "display_label": "rm", - "description": "rm analysis (nf-test)", - "program": "stub", - "program_version": "0.0" - 
}, - "source": { - "source_provider": "prov", - "is_primary": true - }, - "repeat_consensus": [ - { - "repeat_consensus_key": "58bc82baa00a592e0b49f526b80a7c89", - "repeat_name": "Alu", - "repeat_class": "SINE", - "repeat_type": "Alu", - "repeat_consensus": "ACGT" - } - ], - "repeat_features": [ - { - "seq_region": "comp1", - "seq_region_start": 10, - "seq_region_end": 20, - "seq_region_strand": 1, - "repeat_start": 1, - "repeat_end": 11, - "repeat_consensus": "58bc82baa00a592e0b49f526b80a7c89" - } - ] -} diff --git a/modules/ensembl/features/combine_json/tests/data/repeat/agp/manifest.txt b/modules/ensembl/features/combine_json/tests/data/repeat/agp/manifest.txt deleted file mode 100644 index 1ac93e6..0000000 --- a/modules/ensembl/features/combine_json/tests/data/repeat/agp/manifest.txt +++ /dev/null @@ -1 +0,0 @@ -inputs/in.json diff --git a/modules/ensembl/features/combine_json/tests/data/repeat/agp/output/test.features.json b/modules/ensembl/features/combine_json/tests/data/repeat/agp/output/test.features.json deleted file mode 100644 index cfc4cd2..0000000 --- a/modules/ensembl/features/combine_json/tests/data/repeat/agp/output/test.features.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "analysis": { - "run_date": "2026-02-18T00:00:00Z", - "logic_name": "rm", - "display_label": "rm", - "description": "rm analysis (nf-test)", - "program": "stub", - "program_version": "0.0" - }, - "source": { - "source_provider": "prov", - "is_primary": true - }, - "repeat_consensus": [ - { - "repeat_consensus_key": "58bc82baa00a592e0b49f526b80a7c89", - "repeat_name": "Alu", - "repeat_class": "SINE", - "repeat_type": "Alu", - "repeat_consensus": "ACGT" - } - ], - "repeat_features": [ - { - "seq_region": "chr1", - "seq_region_start": 109, - "seq_region_end": 119, - "seq_region_strand": 1, - "repeat_start": 1, - "repeat_end": 11, - "repeat_consensus": "58bc82baa00a592e0b49f526b80a7c89" - } - ] -} diff --git a/modules/ensembl/features/combine_json/tests/data/repeat/agp/test.agp 
b/modules/ensembl/features/combine_json/tests/data/repeat/agp/test.agp deleted file mode 100644 index 86dddab..0000000 --- a/modules/ensembl/features/combine_json/tests/data/repeat/agp/test.agp +++ /dev/null @@ -1 +0,0 @@ -chr1 100 199 1 W comp1 1 100 + diff --git a/modules/ensembl/features/combine_json/tests/data/repeat/custom_regex/inputs/in.json b/modules/ensembl/features/combine_json/tests/data/repeat/custom_regex/inputs/in.json deleted file mode 100644 index 69bfad7..0000000 --- a/modules/ensembl/features/combine_json/tests/data/repeat/custom_regex/inputs/in.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "analysis": { - "run_date": "2026-02-18T00:00:00Z", - "logic_name": "rm", - "display_label": "rm", - "description": "rm analysis (nf-test)", - "program": "stub", - "program_version": "0.0" - }, - "source": { - "source_provider": "prov", - "is_primary": true - }, - "repeat_consensus": [ - { - "repeat_consensus_key": "58bc82baa00a592e0b49f526b80a7c89", - "repeat_name": "Alu", - "repeat_class": "SINE", - "repeat_type": "Alu", - "repeat_consensus": "ACGT" - } - ], - "repeat_features": [ - { - "seq_region": "chr1_11", - "seq_region_start": 1, - "seq_region_end": 5, - "seq_region_strand": 1, - "repeat_start": 1, - "repeat_end": 5, - "repeat_consensus": "58bc82baa00a592e0b49f526b80a7c89" - } - ] -} diff --git a/modules/ensembl/features/combine_json/tests/data/repeat/custom_regex/manifest.txt b/modules/ensembl/features/combine_json/tests/data/repeat/custom_regex/manifest.txt deleted file mode 100644 index 1ac93e6..0000000 --- a/modules/ensembl/features/combine_json/tests/data/repeat/custom_regex/manifest.txt +++ /dev/null @@ -1 +0,0 @@ -inputs/in.json diff --git a/modules/ensembl/features/combine_json/tests/data/repeat/custom_regex/output/test.features.json b/modules/ensembl/features/combine_json/tests/data/repeat/custom_regex/output/test.features.json deleted file mode 100644 index 0d8eff4..0000000 --- 
a/modules/ensembl/features/combine_json/tests/data/repeat/custom_regex/output/test.features.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "analysis": { - "run_date": "2026-02-18T00:00:00Z", - "logic_name": "rm", - "display_label": "rm", - "description": "rm analysis (nf-test)", - "program": "stub", - "program_version": "0.0" - }, - "source": { - "source_provider": "prov", - "is_primary": true - }, - "repeat_consensus": [ - { - "repeat_consensus_key": "58bc82baa00a592e0b49f526b80a7c89", - "repeat_name": "Alu", - "repeat_class": "SINE", - "repeat_type": "Alu", - "repeat_consensus": "ACGT" - } - ], - "repeat_features": [ - { - "seq_region": "chr1", - "seq_region_start": 11, - "seq_region_end": 15, - "seq_region_strand": 1, - "repeat_start": 1, - "repeat_end": 5, - "repeat_consensus": "58bc82baa00a592e0b49f526b80a7c89" - } - ] -} diff --git a/modules/ensembl/features/combine_json/tests/data/repeat/order/inputs/01.json b/modules/ensembl/features/combine_json/tests/data/repeat/order/inputs/01.json deleted file mode 100644 index 269ac0b..0000000 --- a/modules/ensembl/features/combine_json/tests/data/repeat/order/inputs/01.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "analysis": { - "run_date": "2026-02-18T00:00:00Z", - "logic_name": "rm", - "display_label": "rm", - "description": "rm analysis (nf-test)", - "program": "stub", - "program_version": "0.0" - }, - "source": { - "source_provider": "prov", - "is_primary": true - }, - "repeat_consensus": [ - { - "repeat_consensus_key": "58bc82baa00a592e0b49f526b80a7c89", - "repeat_name": "Alu", - "repeat_class": "SINE", - "repeat_type": "Alu", - "repeat_consensus": "ACGT" - } - ], - "repeat_features": [ - { - "seq_region": "chr2_chunk_start_1", - "seq_region_start": 1, - "seq_region_end": 2, - "seq_region_strand": 1, - "repeat_start": 1, - "repeat_end": 2, - "repeat_consensus": "58bc82baa00a592e0b49f526b80a7c89" - } - ] -} diff --git a/modules/ensembl/features/combine_json/tests/data/repeat/order/inputs/02.json 
b/modules/ensembl/features/combine_json/tests/data/repeat/order/inputs/02.json deleted file mode 100644 index 8256fd2..0000000 --- a/modules/ensembl/features/combine_json/tests/data/repeat/order/inputs/02.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "analysis": { - "run_date": "2026-02-18T00:00:00Z", - "logic_name": "rm", - "display_label": "rm", - "description": "rm analysis (nf-test)", - "program": "stub", - "program_version": "0.0" - }, - "source": { - "source_provider": "prov", - "is_primary": true - }, - "repeat_consensus": [ - { - "repeat_consensus_key": "58bc82baa00a592e0b49f526b80a7c89", - "repeat_name": "Alu", - "repeat_class": "SINE", - "repeat_type": "Alu", - "repeat_consensus": "ACGT" - } - ], - "repeat_features": [ - { - "seq_region": "chr2_chunk_start_3", - "seq_region_start": 1, - "seq_region_end": 1, - "seq_region_strand": 1, - "repeat_start": 1, - "repeat_end": 1, - "repeat_consensus": "58bc82baa00a592e0b49f526b80a7c89" - } - ] -} diff --git a/modules/ensembl/features/combine_json/tests/data/repeat/order/manifest.txt b/modules/ensembl/features/combine_json/tests/data/repeat/order/manifest.txt deleted file mode 100644 index dad42b0..0000000 --- a/modules/ensembl/features/combine_json/tests/data/repeat/order/manifest.txt +++ /dev/null @@ -1,2 +0,0 @@ -inputs/02.json -inputs/01.json diff --git a/modules/ensembl/features/combine_json/tests/data/repeat/order/output/test.features.json b/modules/ensembl/features/combine_json/tests/data/repeat/order/output/test.features.json deleted file mode 100644 index 0442952..0000000 --- a/modules/ensembl/features/combine_json/tests/data/repeat/order/output/test.features.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "analysis": { - "run_date": "2026-02-18T00:00:00Z", - "logic_name": "rm", - "display_label": "rm", - "description": "rm analysis (nf-test)", - "program": "stub", - "program_version": "0.0" - }, - "source": { - "source_provider": "prov", - "is_primary": true - }, - "repeat_consensus": [ - { - "repeat_consensus_key": 
"58bc82baa00a592e0b49f526b80a7c89", - "repeat_name": "Alu", - "repeat_class": "SINE", - "repeat_type": "Alu", - "repeat_consensus": "ACGT" - } - ], - "repeat_features": [ - { - "seq_region": "chr2", - "seq_region_start": 3, - "seq_region_end": 3, - "seq_region_strand": 1, - "repeat_start": 1, - "repeat_end": 1, - "repeat_consensus": "58bc82baa00a592e0b49f526b80a7c89" - }, - { - "seq_region": "chr2", - "seq_region_start": 1, - "seq_region_end": 2, - "seq_region_strand": 1, - "repeat_start": 1, - "repeat_end": 2, - "repeat_consensus": "58bc82baa00a592e0b49f526b80a7c89" - } - ] -} diff --git a/modules/ensembl/features/combine_json/tests/data/repeat/seq_region/inputs/a.json b/modules/ensembl/features/combine_json/tests/data/repeat/seq_region/inputs/a.json deleted file mode 100644 index b33f05c..0000000 --- a/modules/ensembl/features/combine_json/tests/data/repeat/seq_region/inputs/a.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "analysis": { - "run_date": "2026-02-18T00:00:00Z", - "logic_name": "rm", - "display_label": "rm", - "description": "rm analysis (nf-test)", - "program": "stub", - "program_version": "0.0" - }, - "source": { - "source_provider": "prov", - "is_primary": true - }, - "repeat_consensus": [ - { - "repeat_consensus_key": "58bc82baa00a592e0b49f526b80a7c89", - "repeat_name": "Alu", - "repeat_class": "SINE", - "repeat_type": "Alu", - "repeat_consensus": "ACGT" - } - ], - "repeat_features": [ - { - "seq_region": "chr1_chunk_start_1", - "seq_region_start": 1, - "seq_region_end": 3, - "seq_region_strand": 1, - "repeat_start": 1, - "repeat_end": 3, - "repeat_consensus": "58bc82baa00a592e0b49f526b80a7c89" - } - ] -} diff --git a/modules/ensembl/features/combine_json/tests/data/repeat/seq_region/inputs/b.json b/modules/ensembl/features/combine_json/tests/data/repeat/seq_region/inputs/b.json deleted file mode 100644 index e6787cb..0000000 --- a/modules/ensembl/features/combine_json/tests/data/repeat/seq_region/inputs/b.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - 
"analysis": { - "run_date": "2026-02-18T00:00:00Z", - "logic_name": "rm", - "display_label": "rm", - "description": "rm analysis (nf-test)", - "program": "stub", - "program_version": "0.0" - }, - "source": { - "source_provider": "prov", - "is_primary": true - }, - "repeat_consensus": [ - { - "repeat_consensus_key": "58bc82baa00a592e0b49f526b80a7c89", - "repeat_name": "Alu", - "repeat_class": "SINE", - "repeat_type": "Alu", - "repeat_consensus": "ACGT" - } - ], - "repeat_features": [ - { - "seq_region": "chr1_chunk_start_4", - "seq_region_start": 1, - "seq_region_end": 2, - "seq_region_strand": 1, - "repeat_start": 1, - "repeat_end": 2, - "repeat_consensus": "58bc82baa00a592e0b49f526b80a7c89" - } - ] -} diff --git a/modules/ensembl/features/combine_json/tests/data/repeat/seq_region/manifest.txt b/modules/ensembl/features/combine_json/tests/data/repeat/seq_region/manifest.txt deleted file mode 100644 index 419c5fd..0000000 --- a/modules/ensembl/features/combine_json/tests/data/repeat/seq_region/manifest.txt +++ /dev/null @@ -1,2 +0,0 @@ -inputs/a.json -inputs/b.json diff --git a/modules/ensembl/features/combine_json/tests/data/repeat/seq_region/output/test.features.json b/modules/ensembl/features/combine_json/tests/data/repeat/seq_region/output/test.features.json deleted file mode 100644 index c69532b..0000000 --- a/modules/ensembl/features/combine_json/tests/data/repeat/seq_region/output/test.features.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "analysis": { - "run_date": "2026-02-18T00:00:00Z", - "logic_name": "rm", - "display_label": "rm", - "description": "rm analysis (nf-test)", - "program": "stub", - "program_version": "0.0" - }, - "source": { - "source_provider": "prov", - "is_primary": true - }, - "repeat_consensus": [ - { - "repeat_consensus_key": "58bc82baa00a592e0b49f526b80a7c89", - "repeat_name": "Alu", - "repeat_class": "SINE", - "repeat_type": "Alu", - "repeat_consensus": "ACGT" - } - ], - "repeat_features": [ - { - "seq_region": "chr1", - 
"seq_region_start": 1, - "seq_region_end": 3, - "seq_region_strand": 1, - "repeat_start": 1, - "repeat_end": 3, - "repeat_consensus": "58bc82baa00a592e0b49f526b80a7c89" - }, - { - "seq_region": "chr1", - "seq_region_start": 4, - "seq_region_end": 5, - "seq_region_strand": 1, - "repeat_start": 1, - "repeat_end": 2, - "repeat_consensus": "58bc82baa00a592e0b49f526b80a7c89" - } - ] -} diff --git a/modules/ensembl/features/combine_json/tests/main.nf.test b/modules/ensembl/features/combine_json/tests/main.nf.test index 153f75b..4557ff1 100644 --- a/modules/ensembl/features/combine_json/tests/main.nf.test +++ b/modules/ensembl/features/combine_json/tests/main.nf.test @@ -32,53 +32,20 @@ nextflow_process { process { """ - input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/repeat/seq_region/manifest.txt'), - file('${workflow.projectDir}/modules/assets/NO_FILE')] - """ - } - } - - then { - assert process.trace.tasks().size() == 1 - assert process.out.combined_json.size() == 1 - assert process.success - assert snapshot(process.out).match() - } - } - - test("Stub outputs: ncRNA seq_region mode") { - - when { - options "-stub" + def inJson = file("in.json") + inJson.text = '{"repeat_features": []}\\n' - process { - """ - input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/ncrna/seq_region/manifest.txt'), - file('${workflow.projectDir}/modules/assets/NO_FILE')] - """ - } - } - - then { - assert process.trace.tasks().size() == 1 - assert process.out.combined_json.size() == 1 - assert process.success - assert snapshot(process.out).match() - } - } - - test("Stub outputs: repeat AGP mode") { + def manifest = file("manifest.txt") + manifest.text = "in.json\\n" - when { - options "-stub" + def noFile = file("NO_FILE") + noFile.text = "" - process { - """ - input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/repeat/agp/manifest.txt'), - file('${moduleDir}/tests/data/repeat/agp/test.agp')] + input[0] = [ + [ id:'test' ], + manifest, + noFile + ] """ } } @@ -91,56 
+58,27 @@ nextflow_process { } } - test("Stub outputs: ncRNA AGP mode") { + test("Stub outputs: ncRNA seq_region mode") { when { options "-stub" process { """ - input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/ncrna/agp/manifest.txt'), - file('${moduleDir}/tests/data/ncrna/agp/test.agp')] - """ - } - } - - then { - assert process.trace.tasks().size() == 1 - assert process.out.combined_json.size() == 1 - assert process.success - assert snapshot(process.out).match() - } - } - - test("Real run: repeat seq_region combine + seq_region-driven liftover") { - - when { - process { - """ - input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/repeat/seq_region/manifest.txt'), - file('${workflow.projectDir}/modules/assets/NO_FILE')] - """ - } - } + def inJson = file("in.json") + inJson.text = '{"ncrna_features": [], "ncrna_tool": "cmscan"}\\n' - then { - assert process.trace.tasks().size() == 1 - assert process.out.combined_json.size() == 1 - assert process.success - assert snapshot(process.out).match() - } - } + def manifest = file("manifest.txt") + manifest.text = "in.json\\n" - test("Real run: ncRNA seq_region combine + seq_region-driven liftover") { + def noFile = file("NO_FILE") + noFile.text = "" - when { - process { - """ - input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/ncrna/seq_region/manifest.txt'), - file('${workflow.projectDir}/modules/assets/NO_FILE')] + input[0] = [ + [ id:'test' ], + manifest, + noFile + ] """ } } @@ -153,56 +91,27 @@ nextflow_process { } } - test("Real run: repeat AGP-driven liftover") { + test("Stub outputs: repeat AGP mode") { when { - process { - """ - input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/repeat/agp/manifest.txt'), - file('${moduleDir}/tests/data/repeat/agp/test.agp')] - """ - } - } - - then { - assert process.trace.tasks().size() == 1 - assert process.out.combined_json.size() == 1 - assert process.success - assert snapshot(process.out).match() - } - } - - test("Real run: ncRNA AGP-driven 
liftover") { + options "-stub" - when { process { """ - input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/ncrna/agp/manifest.txt'), - file('${moduleDir}/tests/data/ncrna/agp/test.agp')] - """ - } - } - - then { - assert process.trace.tasks().size() == 1 - assert process.out.combined_json.size() == 1 - assert process.success - assert snapshot(process.out).match() - } - } + def inJson = file("in.json") + inJson.text = '{"repeat_features": []}\\n' - test("Real run: repeat custom chunk regex") { + def manifest = file("manifest.txt") + manifest.text = "in.json\\n" - when { - params.chunk_id_regex = '^(?P.+)_(?P\\d+)$' + def agp = file("test.agp") + agp.text = "" - process { - """ - input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/repeat/custom_regex/manifest.txt'), - file('${workflow.projectDir}/modules/assets/NO_FILE')] + input[0] = [ + [ id:'test' ], + manifest, + agp + ] """ } } @@ -215,57 +124,27 @@ nextflow_process { } } - test("Real run: ncRNA custom chunk regex") { + test("Stub outputs: ncRNA AGP mode") { when { - // Matches the ncRNA custom_regex inputs: _ - params.chunk_id_regex = '^(?P.+)_(?P\\d+)$' - - process { - """ - input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/ncrna/custom_regex/manifest.txt'), - file('${workflow.projectDir}/modules/assets/NO_FILE')] - """ - } - } - - then { - assert process.trace.tasks().size() == 1 - assert process.out.combined_json.size() == 1 - assert process.success - assert snapshot(process.out).match() - } - } - - test("Real run: repeat manifest order is preserved") { + options "-stub" - when { process { """ - input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/repeat/order/manifest.txt'), - file('${workflow.projectDir}/modules/assets/NO_FILE')] - """ - } - } + def inJson = file("in.json") + inJson.text = '{"ncrna_features": [], "ncrna_tool": "cmscan"}\\n' - then { - assert process.trace.tasks().size() == 1 - assert process.out.combined_json.size() == 1 - assert process.success - assert 
snapshot(process.out).match() - } - } + def manifest = file("manifest.txt") + manifest.text = "in.json\\n" - test("Real run: ncRNA manifest order is preserved") { + def agp = file("test.agp") + agp.text = "" - when { - process { - """ - input[0] = [[ id:'test' ], - file('${moduleDir}/tests/data/ncrna/order/manifest.txt'), - file('${workflow.projectDir}/modules/assets/NO_FILE')] + input[0] = [ + [ id:'test' ], + manifest, + agp + ] """ } } @@ -277,4 +156,4 @@ nextflow_process { assert snapshot(process.out).match() } } -} +} \ No newline at end of file diff --git a/modules/ensembl/features/combine_json/tests/main.nf.test.snap b/modules/ensembl/features/combine_json/tests/main.nf.test.snap index 9ff7391..0ad5cd4 100644 --- a/modules/ensembl/features/combine_json/tests/main.nf.test.snap +++ b/modules/ensembl/features/combine_json/tests/main.nf.test.snap @@ -1,5 +1,5 @@ { - "Real run: AGP-driven liftover": { + "Stub outputs: repeat AGP mode": { "content": [ { "0": [ @@ -7,182 +7,26 @@ { "id": "test" }, - "test.features.json:md5,5fc5a0cd8050982334ada4bca1a55950" + "test.features.json:md5,aefc84472e26178b64d01051be6d58b2" ] ], - "combined_json": [ - [ - { - "id": "test" - }, - "test.features.json:md5,5fc5a0cd8050982334ada4bca1a55950" - ] - ] - } - ], - "timestamp": "2026-02-23T17:54:02.625791", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.3" - } - }, - "Real run: ncRNA custom chunk regex": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.features.json:md5,4c10f64659bc581612383e3afece97fb" - ] + "1": [ + "versions.yml:md5,8258695d6a28a46edbaf2a9bf2dde339" ], "combined_json": [ [ { "id": "test" }, - "test.features.json:md5,4c10f64659bc581612383e3afece97fb" - ] - ] - } - ], - "timestamp": "2026-02-23T19:15:58.553743", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.3" - } - }, - "Real run: header combine + header-driven liftover": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - 
"test.features.json:md5,007a5710a0037aae8f907d13cde08f77" + "test.features.json:md5,aefc84472e26178b64d01051be6d58b2" ] ], - "combined_json": [ - [ - { - "id": "test" - }, - "test.features.json:md5,007a5710a0037aae8f907d13cde08f77" - ] + "versions": [ + "versions.yml:md5,8258695d6a28a46edbaf2a9bf2dde339" ] } ], - "timestamp": "2026-02-23T17:54:00.401674", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.3" - } - }, - "Real run: repeat custom chunk regex": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.features.json:md5,f410544c71be74f7a8a7eab5e494b258" - ] - ], - "combined_json": [ - [ - { - "id": "test" - }, - "test.features.json:md5,f410544c71be74f7a8a7eab5e494b258" - ] - ] - } - ], - "timestamp": "2026-02-23T18:38:58.589502", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.3" - } - }, - "Real run: repeat manifest order is preserved": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.features.json:md5,1b68c1371265dad11839769a5e776b33" - ] - ], - "combined_json": [ - [ - { - "id": "test" - }, - "test.features.json:md5,1b68c1371265dad11839769a5e776b33" - ] - ] - } - ], - "timestamp": "2026-02-23T18:39:03.129965", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.3" - } - }, - "Stub outputs: ncRNA header mode": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.features.json:md5,556a240063931bcbba8ee21d6efc373d" - ] - ], - "combined_json": [ - [ - { - "id": "test" - }, - "test.features.json:md5,556a240063931bcbba8ee21d6efc373d" - ] - ] - } - ], - "timestamp": "2026-02-23T19:21:33.771238", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.3" - } - }, - "Real run: repeat seq_region combine + seq_region-driven liftover": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.features.json:md5,007a5710a0037aae8f907d13cde08f77" - ] - ], - "combined_json": [ - [ - { - "id": "test" - }, - "test.features.json:md5,007a5710a0037aae8f907d13cde08f77" - ] - ] - } - ], - "timestamp": 
"2026-02-23T23:31:17.929825", + "timestamp": "2026-03-11T12:27:17.171188", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" @@ -196,74 +40,26 @@ { "id": "test" }, - "test.features.json:md5,556a240063931bcbba8ee21d6efc373d" + "test.features.json:md5,67c630685f9c819ef28574144c284b4e" ] ], - "combined_json": [ - [ - { - "id": "test" - }, - "test.features.json:md5,556a240063931bcbba8ee21d6efc373d" - ] - ] - } - ], - "timestamp": "2026-02-23T23:32:28.865106", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.3" - } - }, - "Real run: ncRNA header combine + header-driven liftover": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.features.json:md5,4c10f64659bc581612383e3afece97fb" - ] + "1": [ + "versions.yml:md5,8258695d6a28a46edbaf2a9bf2dde339" ], "combined_json": [ [ { "id": "test" }, - "test.features.json:md5,4c10f64659bc581612383e3afece97fb" - ] - ] - } - ], - "timestamp": "2026-02-23T19:15:49.744214", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.3" - } - }, - "Real run: ncRNA seq_region combine + seq_region-driven liftover": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.features.json:md5,4c10f64659bc581612383e3afece97fb" + "test.features.json:md5,67c630685f9c819ef28574144c284b4e" ] ], - "combined_json": [ - [ - { - "id": "test" - }, - "test.features.json:md5,4c10f64659bc581612383e3afece97fb" - ] + "versions": [ + "versions.yml:md5,8258695d6a28a46edbaf2a9bf2dde339" ] } ], - "timestamp": "2026-02-23T23:31:20.204864", + "timestamp": "2026-03-11T12:27:15.074952", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" @@ -277,263 +73,26 @@ { "id": "test" }, - "test.features.json:md5,435c4d8f4008e57685ff951bbe81df0e" - ] - ], - "combined_json": [ - [ - { - "id": "test" - }, - "test.features.json:md5,435c4d8f4008e57685ff951bbe81df0e" - ] - ] - } - ], - "timestamp": "2026-02-23T19:21:38.112104", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.3" - } - }, - "Real run: ncRNA manifest order is preserved": { - 
"content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.features.json:md5,c5b36cf499f0d111684f91372469154f" - ] - ], - "combined_json": [ - [ - { - "id": "test" - }, - "test.features.json:md5,c5b36cf499f0d111684f91372469154f" - ] - ] - } - ], - "timestamp": "2026-02-23T19:16:02.962026", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.3" - } - }, - "Stub outputs: repeat AGP mode": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.features.json:md5,5fc5a0cd8050982334ada4bca1a55950" + "test.features.json:md5,67c630685f9c819ef28574144c284b4e" ] ], - "combined_json": [ - [ - { - "id": "test" - }, - "test.features.json:md5,5fc5a0cd8050982334ada4bca1a55950" - ] - ] - } - ], - "timestamp": "2026-02-23T19:21:35.954494", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.3" - } - }, - "Stub outputs: AGP mode": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.features.json:md5,5fc5a0cd8050982334ada4bca1a55950" - ] + "1": [ + "versions.yml:md5,8258695d6a28a46edbaf2a9bf2dde339" ], "combined_json": [ [ { "id": "test" }, - "test.features.json:md5,5fc5a0cd8050982334ada4bca1a55950" - ] - ] - } - ], - "timestamp": "2026-02-23T17:53:58.199351", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.3" - } - }, - "Stub outputs: header mode": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.features.json:md5,007a5710a0037aae8f907d13cde08f77" + "test.features.json:md5,67c630685f9c819ef28574144c284b4e" ] ], - "combined_json": [ - [ - { - "id": "test" - }, - "test.features.json:md5,007a5710a0037aae8f907d13cde08f77" - ] + "versions": [ + "versions.yml:md5,8258695d6a28a46edbaf2a9bf2dde339" ] } ], - "timestamp": "2026-02-23T17:53:56.112251", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.3" - } - }, - "Real run: ncRNA AGP-driven liftover": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.features.json:md5,837dcba57ebd00c1b8adbce528b8f1b0" - ] - ], - "combined_json": [ - [ - { - "id": "test" - }, - 
"test.features.json:md5,837dcba57ebd00c1b8adbce528b8f1b0" - ] - ] - } - ], - "timestamp": "2026-02-23T19:15:54.146861", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.3" - } - }, - "Real run: repeat header combine + header-driven liftover": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.features.json:md5,007a5710a0037aae8f907d13cde08f77" - ] - ], - "combined_json": [ - [ - { - "id": "test" - }, - "test.features.json:md5,007a5710a0037aae8f907d13cde08f77" - ] - ] - } - ], - "timestamp": "2026-02-23T18:38:49.606314", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.3" - } - }, - "Real run: custom chunk regex": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.features.json:md5,f410544c71be74f7a8a7eab5e494b258" - ] - ], - "combined_json": [ - [ - { - "id": "test" - }, - "test.features.json:md5,f410544c71be74f7a8a7eab5e494b258" - ] - ] - } - ], - "timestamp": "2026-02-23T17:54:04.861554", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.3" - } - }, - "Stub outputs: repeat header mode": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.features.json:md5,007a5710a0037aae8f907d13cde08f77" - ] - ], - "combined_json": [ - [ - { - "id": "test" - }, - "test.features.json:md5,007a5710a0037aae8f907d13cde08f77" - ] - ] - } - ], - "timestamp": "2026-02-23T19:21:31.584701", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.3" - } - }, - "Real run: manifest order is preserved": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.features.json:md5,1b68c1371265dad11839769a5e776b33" - ] - ], - "combined_json": [ - [ - { - "id": "test" - }, - "test.features.json:md5,1b68c1371265dad11839769a5e776b33" - ] - ] - } - ], - "timestamp": "2026-02-23T17:54:07.074875", + "timestamp": "2026-03-11T12:27:19.259793", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" @@ -547,47 +106,26 @@ { "id": "test" }, - "test.features.json:md5,007a5710a0037aae8f907d13cde08f77" + 
"test.features.json:md5,aefc84472e26178b64d01051be6d58b2" ] ], - "combined_json": [ - [ - { - "id": "test" - }, - "test.features.json:md5,007a5710a0037aae8f907d13cde08f77" - ] - ] - } - ], - "timestamp": "2026-02-23T23:32:26.754167", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.3" - } - }, - "Real run: repeat AGP-driven liftover": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.features.json:md5,5fc5a0cd8050982334ada4bca1a55950" - ] + "1": [ + "versions.yml:md5,8258695d6a28a46edbaf2a9bf2dde339" ], "combined_json": [ [ { "id": "test" }, - "test.features.json:md5,5fc5a0cd8050982334ada4bca1a55950" + "test.features.json:md5,aefc84472e26178b64d01051be6d58b2" ] + ], + "versions": [ + "versions.yml:md5,8258695d6a28a46edbaf2a9bf2dde339" ] } ], - "timestamp": "2026-02-23T18:38:54.140158", + "timestamp": "2026-03-11T12:27:12.976715", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" From 1405de94787e7e5b96902bebebf3b29550b8278d Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Mon, 16 Mar 2026 15:09:12 +0000 Subject: [PATCH 23/36] Move dynamic memory allocation to pipeline --- modules/ensembl/fasta/split/main.nf | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/modules/ensembl/fasta/split/main.nf b/modules/ensembl/fasta/split/main.nf index 4a33e00..2855857 100644 --- a/modules/ensembl/fasta/split/main.nf +++ b/modules/ensembl/fasta/split/main.nf @@ -21,8 +21,6 @@ process FASTA_SPLIT { conda "${moduleDir}/environment.yml" container "ensemblorg/ensembl-genomio:v1.6.1" - memory { fasta_split_mem(longest_seq_bp) } - input: tuple val(meta), path(fasta), val(longest_seq_bp) @@ -122,14 +120,3 @@ process FASTA_SPLIT { END_VERSIONS """ } - - -def fasta_split_mem(longest_seq_bp) { - if( !longest_seq_bp || longest_seq_bp <= 0 ) return 8.GB - - // Heuristic: ~2.5 bytes/base peak => ~1 GB per 400 Mbp of the *longest* sequence - // Add 2GB base memory to account for overhead - def mem_gb = 2 + Math.ceil(longest_seq_bp as double / 
400_000_000d) - return mem_gb.GB -} - From b00d0ac6b7aa451d0e0ddb484a99e20296b5fd3e Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Fri, 20 Mar 2026 16:14:22 +0000 Subject: [PATCH 24/36] Update output filename --- modules/ensembl/features/combine_json/main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/ensembl/features/combine_json/main.nf b/modules/ensembl/features/combine_json/main.nf index f6dd127..1ebd9c1 100644 --- a/modules/ensembl/features/combine_json/main.nf +++ b/modules/ensembl/features/combine_json/main.nf @@ -25,7 +25,7 @@ process FEATURES_COMBINE_JSON { tuple val(meta), path(json_manifest), path(agp) output: - tuple val(meta), path("${meta.id}.features.json"), emit: combined_json + tuple val(meta), path("${meta.id}.${meta.analysis}.json"), emit: combined_json path "versions.yml", emit: versions script: @@ -45,7 +45,7 @@ process FEATURES_COMBINE_JSON { args << "--agp-file '${agp}'" } - def out_json = "${meta.id}.features.json" + def out_json = "${meta.id}.${meta.analysis}.json" """ features_combine_json \\ @@ -63,7 +63,7 @@ process FEATURES_COMBINE_JSON { """ set -euo pipefail - out_json="${meta.id}.features.json" + out_json="${meta.id}.${meta.analysis}.json" test -s "${json_manifest}" From 78e789d0cdc62bfcb75591f27f444965be4ac9fe Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Tue, 12 May 2026 13:36:47 +0100 Subject: [PATCH 25/36] Add meta files --- modules/ensembl/fasta/recombine/meta.yml | 56 ++++++++++++++++ modules/ensembl/fasta/split/meta.yml | 64 +++++++++++++++++++ .../ensembl/features/combine_json/meta.yml | 56 ++++++++++++++++ .../features/combine_json/tests/main.nf.test | 10 +-- .../combine_json/tests/main.nf.test.snap | 26 +++++--- 5 files changed, 198 insertions(+), 14 deletions(-) create mode 100644 modules/ensembl/fasta/recombine/meta.yml create mode 100644 modules/ensembl/fasta/split/meta.yml create mode 100644 modules/ensembl/features/combine_json/meta.yml diff --git 
a/modules/ensembl/fasta/recombine/meta.yml b/modules/ensembl/fasta/recombine/meta.yml new file mode 100644 index 0000000..98bc4b6 --- /dev/null +++ b/modules/ensembl/fasta/recombine/meta.yml @@ -0,0 +1,56 @@ +--- +name: "fasta_recombine" +description: Recombine split FASTA sequences into a single FASTA file, + optionally using an AGP file. +keywords: + - ensembl + - fasta + - genomics + - genomio + - recombine + +tools: + - "fasta_recombine": + description: "Recombine split FASTA sequences generated by ensembl-genomio." + homepage: "https://github.com/Ensembl/ensembl-genomio" + licence: ["Apache License version 2.0"] + identifier: "" + +input: + - - meta: + type: map + description: | + Groovy Map containing meta information + e.g. `[ id:'accession1' ]` + - fasta_manifest: + type: file + description: Manifest file listing split FASTA files to recombine. + pattern: "*.txt" + ontologies: [] + - agp: + type: file + description: + Optional AGP file describing how split sequence chunks should + be recombined. Use NO_FILE when not required. + pattern: "*.{agp,NO_FILE}" + ontologies: [] +output: + recombined_fasta: + - - meta: + type: map + description: | + Groovy Map containing meta information + e.g. `[ id:'accession1' ]` + - "${meta.id}.fa": + type: file + description: Recombined FASTA file. + pattern: "*.fa" + versions: + - versions.yml: + type: file + description: File containing software versions. + pattern: "versions.yml" +authors: + - "ensembl-dev@ebi.ac.uk" +maintainers: + - "ensembl-dev@ebi.ac.uk" diff --git a/modules/ensembl/fasta/split/meta.yml b/modules/ensembl/fasta/split/meta.yml new file mode 100644 index 0000000..59ca0a8 --- /dev/null +++ b/modules/ensembl/fasta/split/meta.yml @@ -0,0 +1,64 @@ +--- +name: "fasta_split" +description: Split a FASTA file into smaller FASTA files and optionally write an + AGP file. 
+keywords: + - ensembl + - fasta + - genomics + - genomio + - split + +tools: + - "fasta_split": + description: "Split FASTA files with ensembl-genomio." + homepage: "https://github.com/Ensembl/ensembl-genomio" + licence: ["Apache License version 2.0"] + identifier: "" + +input: + - - meta: + type: map + description: | + Groovy Map containing meta information + e.g. `[ id:'accession1' ]` + - fasta: + type: file + description: FASTA file to split. + pattern: "*.{fa,fasta,fna}" + ontologies: [] + - longest_seq_bp: + type: integer + description: Length in base pairs of the longest sequence in the input + FASTA. + +output: + fastas: + - - meta: + type: map + description: | + Groovy Map containing meta information + e.g. `[ id:'accession1' ]` + - "splits/**/*.fa": + type: file + description: Split FASTA files. + pattern: "splits/**/*.fa" + agp: + - - meta: + type: map + description: | + Groovy Map containing meta information + e.g. `[ id:'accession1' ]` + - "splits/*.agp": + type: file + description: Optional AGP file describing split sequence chunks. + pattern: "splits/*.agp" + versions: + - versions.yml: + type: file + description: File containing software versions. + pattern: "versions.yml" +authors: + - "ensembl-dev@ebi.ac.uk" +maintainers: + - "ensembl-dev@ebi.ac.uk" diff --git a/modules/ensembl/features/combine_json/meta.yml b/modules/ensembl/features/combine_json/meta.yml new file mode 100644 index 0000000..750559d --- /dev/null +++ b/modules/ensembl/features/combine_json/meta.yml @@ -0,0 +1,56 @@ +--- +name: "features_combine_json" +description: Combine split feature JSON files into a single JSON file, + optionally using an AGP file. +keywords: + - ensembl + - features + - genomics + - genomio + - json + +tools: + - "features_combine_json": + description: "Combine split feature JSON files generated by ensembl-genomio." 
+ homepage: "https://github.com/Ensembl/ensembl-genomio" + licence: ["Apache License version 2.0"] + identifier: "" + +input: + - - meta: + type: map + description: | + Groovy Map containing meta information + e.g. `[ id:'accession1', analysis:'repeat' ]` + - json_manifest: + type: file + description: Manifest file listing split JSON files to combine. + pattern: "*.txt" + ontologies: [] + - agp: + type: file + description: + Optional AGP file describing how split sequence chunks should + be recombined. Use NO_FILE when not required. + pattern: "*.{agp,NO_FILE}" + ontologies: [] +output: + combined_json: + - - meta: + type: map + description: | + Groovy Map containing meta information + e.g. `[ id:'accession1', analysis:'repeat' ]` + - "${meta.id}.${meta.analysis}.json": + type: file + description: Combined feature JSON file. + pattern: "*.json" + versions: + - versions.yml: + type: file + description: File containing software versions. + pattern: "versions.yml" +authors: + - "ensembl-dev@ebi.ac.uk" +maintainers: + - "ensembl-dev@ebi.ac.uk" diff --git a/modules/ensembl/features/combine_json/tests/main.nf.test b/modules/ensembl/features/combine_json/tests/main.nf.test index 4557ff1..cb61d27 100644 --- a/modules/ensembl/features/combine_json/tests/main.nf.test +++ b/modules/ensembl/features/combine_json/tests/main.nf.test @@ -42,7 +42,7 @@ nextflow_process { noFile.text = "" input[0] = [ - [ id:'test' ], + [ id:'test', analysis:'features' ], manifest, noFile ] @@ -75,7 +75,7 @@ nextflow_process { noFile.text = "" input[0] = [ - [ id:'test' ], + [ id:'test', analysis:'features' ], manifest, noFile ] @@ -108,7 +108,7 @@ nextflow_process { agp.text = "" input[0] = [ - [ id:'test' ], + [ id:'test', analysis:'features' ], manifest, agp ] @@ -141,7 +141,7 @@ nextflow_process { agp.text = "" input[0] = [ - [ id:'test' ], + [ id:'test', analysis:'features' ], manifest, agp ] @@ -156,4 +156,4 @@ nextflow_process { assert snapshot(process.out).match() } } -} \ No newline at end 
of file +} diff --git a/modules/ensembl/features/combine_json/tests/main.nf.test.snap b/modules/ensembl/features/combine_json/tests/main.nf.test.snap index 0ad5cd4..238ac28 100644 --- a/modules/ensembl/features/combine_json/tests/main.nf.test.snap +++ b/modules/ensembl/features/combine_json/tests/main.nf.test.snap @@ -5,7 +5,8 @@ "0": [ [ { - "id": "test" + "id": "test", + "analysis": "features" }, "test.features.json:md5,aefc84472e26178b64d01051be6d58b2" ] @@ -16,7 +17,8 @@ "combined_json": [ [ { - "id": "test" + "id": "test", + "analysis": "features" }, "test.features.json:md5,aefc84472e26178b64d01051be6d58b2" ] @@ -38,7 +40,8 @@ "0": [ [ { - "id": "test" + "id": "test", + "analysis": "features" }, "test.features.json:md5,67c630685f9c819ef28574144c284b4e" ] @@ -49,7 +52,8 @@ "combined_json": [ [ { - "id": "test" + "id": "test", + "analysis": "features" }, "test.features.json:md5,67c630685f9c819ef28574144c284b4e" ] @@ -71,7 +75,8 @@ "0": [ [ { - "id": "test" + "id": "test", + "analysis": "features" }, "test.features.json:md5,67c630685f9c819ef28574144c284b4e" ] @@ -82,7 +87,8 @@ "combined_json": [ [ { - "id": "test" + "id": "test", + "analysis": "features" }, "test.features.json:md5,67c630685f9c819ef28574144c284b4e" ] @@ -104,7 +110,8 @@ "0": [ [ { - "id": "test" + "id": "test", + "analysis": "features" }, "test.features.json:md5,aefc84472e26178b64d01051be6d58b2" ] @@ -115,7 +122,8 @@ "combined_json": [ [ { - "id": "test" + "id": "test", + "analysis": "features" }, "test.features.json:md5,aefc84472e26178b64d01051be6d58b2" ] @@ -131,4 +139,4 @@ "nextflow": "25.10.3" } } -} \ No newline at end of file +} From 2883bc08cef4fb12b21abbea7420c3b25a23ca34 Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Tue, 12 May 2026 13:59:04 +0100 Subject: [PATCH 26/36] Linting updates --- .../ensembl/fasta/recombine/environment.yml | 3 +- modules/ensembl/fasta/recombine/main.nf | 19 ++-- modules/ensembl/fasta/recombine/meta.yml | 24 ++++- 
.../fasta/recombine/tests/main.nf.test | 6 +- .../fasta/recombine/tests/main.nf.test.snap | 36 +++++-- modules/ensembl/fasta/split/environment.yml | 3 +- modules/ensembl/fasta/split/main.nf | 18 ++-- modules/ensembl/fasta/split/meta.yml | 24 ++++- .../ensembl/fasta/split/tests/main.nf.test | 8 +- .../fasta/split/tests/main.nf.test.snap | 100 ++++++++++++------ .../features/combine_json/environment.yml | 3 +- modules/ensembl/features/combine_json/main.nf | 18 ++-- .../ensembl/features/combine_json/meta.yml | 24 ++++- .../combine_json/tests/main.nf.test.snap | 66 +++++++++--- 14 files changed, 232 insertions(+), 120 deletions(-) diff --git a/modules/ensembl/fasta/recombine/environment.yml b/modules/ensembl/fasta/recombine/environment.yml index 52b218c..94089f3 100644 --- a/modules/ensembl/fasta/recombine/environment.yml +++ b/modules/ensembl/fasta/recombine/environment.yml @@ -1,7 +1,6 @@ --- -name: "fasta_recombine" channels: - conda-forge - bioconda dependencies: - - ensembl-genomio=1.6.1 \ No newline at end of file + - ensembl-genomio=1.6.1 diff --git a/modules/ensembl/fasta/recombine/main.nf b/modules/ensembl/fasta/recombine/main.nf index 057c98f..1e2e88c 100644 --- a/modules/ensembl/fasta/recombine/main.nf +++ b/modules/ensembl/fasta/recombine/main.nf @@ -19,14 +19,19 @@ process FASTA_RECOMBINE { label 'process_medium' conda "${moduleDir}/environment.yml" - container "ensemblorg/ensembl-genomio:v1.6.1" + container "${workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container + ? 
'https://depot.galaxyproject.org/singularity/ensembl-genomio:1.6.1--pyhdfd78af_0' + : 'biocontainers/ensembl-genomio:1.6.1--pyhdfd78af_0'}" input: tuple val(meta), path(fasta_manifest), path(agp) output: tuple val(meta), path("${meta.id}.fa"), emit: recombined_fasta - path "versions.yml", emit: versions + tuple val("${task.process}"), val('fasta_recombine'), eval('echo 1.6.1'), emit: versions_fasta_recombine, topic: versions + + when: + task.ext.when == null || task.ext.when script: def args = [] @@ -52,11 +57,6 @@ process FASTA_RECOMBINE { --fasta-manifest ${fasta_manifest} \\ --out-fasta ${out_fasta} \\ ${args.join(' ')} - - cat <<-END_VERSIONS > versions.yml - ${task.process}: - fasta_recombine: \$(fasta_recombine --version 2>/dev/null | head -n 1) - END_VERSIONS """ stub: @@ -65,10 +65,5 @@ process FASTA_RECOMBINE { out_fa="${meta.id}.fa" touch "\$out_fa" - - cat <<-END_VERSIONS > versions.yml - ${task.process}: - fasta_recombine: stub - END_VERSIONS """ } diff --git a/modules/ensembl/fasta/recombine/meta.yml b/modules/ensembl/fasta/recombine/meta.yml index 98bc4b6..8583e3e 100644 --- a/modules/ensembl/fasta/recombine/meta.yml +++ b/modules/ensembl/fasta/recombine/meta.yml @@ -45,11 +45,27 @@ output: type: file description: Recombined FASTA file. pattern: "*.fa" + versions_fasta_recombine: + - - ${task.process}: + type: string + description: The name of the process. + - fasta_recombine: + type: string + description: The name of the tool. + - echo 1.6.1: + type: eval + description: The expression to obtain the version of the tool. +topics: versions: - - versions.yml: - type: file - description: File containing software versions. - pattern: "versions.yml" + - - ${task.process}: + type: string + description: The name of the process. + - fasta_recombine: + type: string + description: The name of the tool. + - echo 1.6.1: + type: eval + description: The expression to obtain the version of the tool. 
authors: - "ensembl-dev@ebi.ac.uk" maintainers: diff --git a/modules/ensembl/fasta/recombine/tests/main.nf.test b/modules/ensembl/fasta/recombine/tests/main.nf.test index 9a7a6c9..4448cf7 100644 --- a/modules/ensembl/fasta/recombine/tests/main.nf.test +++ b/modules/ensembl/fasta/recombine/tests/main.nf.test @@ -25,7 +25,7 @@ nextflow_process { tag "fasta" tag "fasta/recombine" - test("Stub outputs: header mode") { + test("stub outputs: header mode") { when { options "-stub" @@ -55,7 +55,7 @@ nextflow_process { } } - test("Stub outputs: AGP mode") { + test("stub outputs: AGP mode") { when { options "-stub" @@ -83,4 +83,4 @@ nextflow_process { assert snapshot(process.out).match() } } -} \ No newline at end of file +} diff --git a/modules/ensembl/fasta/recombine/tests/main.nf.test.snap b/modules/ensembl/fasta/recombine/tests/main.nf.test.snap index 3c98f07..2ad0719 100644 --- a/modules/ensembl/fasta/recombine/tests/main.nf.test.snap +++ b/modules/ensembl/fasta/recombine/tests/main.nf.test.snap @@ -1,5 +1,5 @@ { - "Stub outputs: AGP mode": { + "stub outputs: AGP mode": { "content": [ { "0": [ @@ -11,7 +11,11 @@ ] ], "1": [ - "versions.yml:md5,191cc20355b504364a619df6b4c639aa" + [ + "FASTA_RECOMBINE", + "fasta_recombine", + "1.6.1" + ] ], "recombined_fasta": [ [ @@ -21,18 +25,22 @@ "test.fa:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], - "versions": [ - "versions.yml:md5,191cc20355b504364a619df6b4c639aa" + "versions_fasta_recombine": [ + [ + "FASTA_RECOMBINE", + "fasta_recombine", + "1.6.1" + ] ] } ], - "timestamp": "2026-03-11T12:20:11.373089", + "timestamp": "2026-05-12T13:50:13.249443", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" } }, - "Stub outputs: header mode": { + "stub outputs: header mode": { "content": [ { "0": [ @@ -44,7 +52,11 @@ ] ], "1": [ - "versions.yml:md5,191cc20355b504364a619df6b4c639aa" + [ + "FASTA_RECOMBINE", + "fasta_recombine", + "1.6.1" + ] ], "recombined_fasta": [ [ @@ -54,12 +66,16 @@ "test.fa:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], - 
"versions": [ - "versions.yml:md5,191cc20355b504364a619df6b4c639aa" + "versions_fasta_recombine": [ + [ + "FASTA_RECOMBINE", + "fasta_recombine", + "1.6.1" + ] ] } ], - "timestamp": "2026-03-11T12:20:09.308095", + "timestamp": "2026-05-12T13:50:11.167936", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" diff --git a/modules/ensembl/fasta/split/environment.yml b/modules/ensembl/fasta/split/environment.yml index 208dc35..94089f3 100644 --- a/modules/ensembl/fasta/split/environment.yml +++ b/modules/ensembl/fasta/split/environment.yml @@ -1,7 +1,6 @@ --- -name: "fasta_split" channels: - conda-forge - bioconda dependencies: - - ensembl-genomio=1.6.1 \ No newline at end of file + - ensembl-genomio=1.6.1 diff --git a/modules/ensembl/fasta/split/main.nf b/modules/ensembl/fasta/split/main.nf index 2855857..2e3acc5 100644 --- a/modules/ensembl/fasta/split/main.nf +++ b/modules/ensembl/fasta/split/main.nf @@ -19,7 +19,9 @@ process FASTA_SPLIT { label 'process_medium' conda "${moduleDir}/environment.yml" - container "ensemblorg/ensembl-genomio:v1.6.1" + container "${workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container + ? 
'https://depot.galaxyproject.org/singularity/ensembl-genomio:1.6.1--pyhdfd78af_0' + : 'biocontainers/ensembl-genomio:1.6.1--pyhdfd78af_0'}" input: tuple val(meta), path(fasta), val(longest_seq_bp) @@ -27,7 +29,10 @@ process FASTA_SPLIT { output: tuple val(meta), path("splits/**/*.fa"), emit: fastas tuple val(meta), path("splits/*.agp"), emit: agp, optional: true - path "versions.yml", emit: versions + tuple val("${task.process}"), val('fasta_split'), eval('echo 1.6.1'), emit: versions_fasta_split, topic: versions + + when: + task.ext.when == null || task.ext.when script: def args = [] @@ -73,11 +78,6 @@ process FASTA_SPLIT { --fasta-file ${fasta} \\ --out-dir splits \\ ${args.join(' ')} - - cat <<-END_VERSIONS > versions.yml - ${task.process}: - fasta_split: \$(fasta_split --version 2>/dev/null | head -n 1) - END_VERSIONS """ stub: @@ -114,9 +114,5 @@ process FASTA_SPLIT { touch "splits/${meta.id}.agp" fi - cat <<-END_VERSIONS > versions.yml - ${task.process}: - fasta_split: stub - END_VERSIONS """ } diff --git a/modules/ensembl/fasta/split/meta.yml b/modules/ensembl/fasta/split/meta.yml index 59ca0a8..96303dd 100644 --- a/modules/ensembl/fasta/split/meta.yml +++ b/modules/ensembl/fasta/split/meta.yml @@ -53,11 +53,27 @@ output: type: file description: Optional AGP file describing split sequence chunks. pattern: "splits/*.agp" + versions_fasta_split: + - - ${task.process}: + type: string + description: The name of the process. + - fasta_split: + type: string + description: The name of the tool. + - echo 1.6.1: + type: eval + description: The expression to obtain the version of the tool. +topics: versions: - - versions.yml: - type: file - description: File containing software versions. - pattern: "versions.yml" + - - ${task.process}: + type: string + description: The name of the process. + - fasta_split: + type: string + description: The name of the tool. + - echo 1.6.1: + type: eval + description: The expression to obtain the version of the tool. 
authors: - "ensembl-dev@ebi.ac.uk" maintainers: diff --git a/modules/ensembl/fasta/split/tests/main.nf.test b/modules/ensembl/fasta/split/tests/main.nf.test index 5aa3acf..042ff9c 100644 --- a/modules/ensembl/fasta/split/tests/main.nf.test +++ b/modules/ensembl/fasta/split/tests/main.nf.test @@ -26,7 +26,7 @@ nextflow_process { tag "fasta/split" - test("Stub outputs: default layout, no AGP") { + test("stub outputs: default layout, no AGP") { when { options "-stub" @@ -67,7 +67,7 @@ nextflow_process { } } - test("Stub outputs: AGP optional output appears when enabled") { + test("stub outputs: AGP optional output appears when enabled") { when { options "-stub" @@ -109,7 +109,7 @@ nextflow_process { } } - test("Stub outputs: unique_file_names contract") { + test("stub outputs: unique_file_names contract") { when { options "-stub" @@ -144,7 +144,7 @@ nextflow_process { } } - test("Stub outputs: nested directory layout contract") { + test("stub outputs: nested directory layout contract") { when { options "-stub" diff --git a/modules/ensembl/fasta/split/tests/main.nf.test.snap b/modules/ensembl/fasta/split/tests/main.nf.test.snap index d736a2a..9914c59 100644 --- a/modules/ensembl/fasta/split/tests/main.nf.test.snap +++ b/modules/ensembl/fasta/split/tests/main.nf.test.snap @@ -1,5 +1,5 @@ { - "Stub outputs: AGP optional output appears when enabled": { + "stub outputs: default layout, no AGP": { "content": [ { "0": [ @@ -14,23 +14,17 @@ ] ], "1": [ - [ - { - "id": "test" - }, - "test.agp:md5,d41d8cd98f00b204e9800998ecf8427e" - ] + ], "2": [ - "versions.yml:md5,f9829a6851db178766a8ce7426f53a65" - ], - "agp": [ [ - { - "id": "test" - }, - "test.agp:md5,d41d8cd98f00b204e9800998ecf8427e" + "FASTA_SPLIT", + "fasta_split", + "1.6.1" ] + ], + "agp": [ + ], "fastas": [ [ @@ -43,18 +37,22 @@ ] ] ], - "versions": [ - "versions.yml:md5,f9829a6851db178766a8ce7426f53a65" + "versions_fasta_split": [ + [ + "FASTA_SPLIT", + "fasta_split", + "1.6.1" + ] ] } ], - "timestamp": 
"2026-03-11T12:20:33.334793", + "timestamp": "2026-05-12T13:50:15.366293", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" } }, - "Stub outputs: nested directory layout contract": { + "stub outputs: AGP optional output appears when enabled": { "content": [ { "0": [ @@ -69,13 +67,27 @@ ] ], "1": [ - + [ + { + "id": "test" + }, + "test.agp:md5,d41d8cd98f00b204e9800998ecf8427e" + ] ], "2": [ - "versions.yml:md5,f9829a6851db178766a8ce7426f53a65" + [ + "FASTA_SPLIT", + "fasta_split", + "1.6.1" + ] ], "agp": [ - + [ + { + "id": "test" + }, + "test.agp:md5,d41d8cd98f00b204e9800998ecf8427e" + ] ], "fastas": [ [ @@ -88,18 +100,22 @@ ] ] ], - "versions": [ - "versions.yml:md5,f9829a6851db178766a8ce7426f53a65" + "versions_fasta_split": [ + [ + "FASTA_SPLIT", + "fasta_split", + "1.6.1" + ] ] } ], - "timestamp": "2026-03-11T12:20:37.504172", + "timestamp": "2026-05-12T13:50:17.499614", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" } }, - "Stub outputs: default layout, no AGP": { + "stub outputs: nested directory layout contract": { "content": [ { "0": [ @@ -117,7 +133,11 @@ ], "2": [ - "versions.yml:md5,f9829a6851db178766a8ce7426f53a65" + [ + "FASTA_SPLIT", + "fasta_split", + "1.6.1" + ] ], "agp": [ @@ -133,18 +153,22 @@ ] ] ], - "versions": [ - "versions.yml:md5,f9829a6851db178766a8ce7426f53a65" + "versions_fasta_split": [ + [ + "FASTA_SPLIT", + "fasta_split", + "1.6.1" + ] ] } ], - "timestamp": "2026-03-11T12:20:31.268587", + "timestamp": "2026-05-12T13:50:21.755317", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" } }, - "Stub outputs: unique_file_names contract": { + "stub outputs: unique_file_names contract": { "content": [ { "0": [ @@ -162,7 +186,11 @@ ], "2": [ - "versions.yml:md5,f9829a6851db178766a8ce7426f53a65" + [ + "FASTA_SPLIT", + "fasta_split", + "1.6.1" + ] ], "agp": [ @@ -178,12 +206,16 @@ ] ] ], - "versions": [ - "versions.yml:md5,f9829a6851db178766a8ce7426f53a65" + "versions_fasta_split": [ + [ + "FASTA_SPLIT", + "fasta_split", + "1.6.1" + ] ] } 
], - "timestamp": "2026-03-11T12:20:35.403767", + "timestamp": "2026-05-12T13:50:19.618244", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" diff --git a/modules/ensembl/features/combine_json/environment.yml b/modules/ensembl/features/combine_json/environment.yml index 5f1cb32..94089f3 100644 --- a/modules/ensembl/features/combine_json/environment.yml +++ b/modules/ensembl/features/combine_json/environment.yml @@ -1,7 +1,6 @@ --- -name: "features_combine_json" channels: - conda-forge - bioconda dependencies: - - ensembl-genomio=1.6.1 \ No newline at end of file + - ensembl-genomio=1.6.1 diff --git a/modules/ensembl/features/combine_json/main.nf b/modules/ensembl/features/combine_json/main.nf index 1ebd9c1..a44767e 100644 --- a/modules/ensembl/features/combine_json/main.nf +++ b/modules/ensembl/features/combine_json/main.nf @@ -19,14 +19,19 @@ process FEATURES_COMBINE_JSON { label 'process_medium' conda "${moduleDir}/environment.yml" - container "ensemblorg/ensembl-genomio:v1.6.1" + container "${workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container + ? 
'https://depot.galaxyproject.org/singularity/ensembl-genomio:1.6.1--pyhdfd78af_0' + : 'biocontainers/ensembl-genomio:1.6.1--pyhdfd78af_0'}" input: tuple val(meta), path(json_manifest), path(agp) output: tuple val(meta), path("${meta.id}.${meta.analysis}.json"), emit: combined_json - path "versions.yml", emit: versions + tuple val("${task.process}"), val('features_combine_json'), eval('echo 1.6.1'), emit: versions_features_combine_json, topic: versions + + when: + task.ext.when == null || task.ext.when script: def args = [] @@ -52,11 +57,6 @@ process FEATURES_COMBINE_JSON { --json-manifest '${json_manifest}' \\ --out-json '${out_json}' \\ ${args.join(' ')} - - cat <<-END_VERSIONS > versions.yml - ${task.process}: - features_combine_json: \$(features_combine_json --version 2>/dev/null | head -n 1) - END_VERSIONS """ stub: @@ -124,10 +124,6 @@ EOF EOF fi - cat <<-END_VERSIONS > versions.yml - ${task.process}: - features_combine_json: stub - END_VERSIONS """ } diff --git a/modules/ensembl/features/combine_json/meta.yml b/modules/ensembl/features/combine_json/meta.yml index 750559d..e524e75 100644 --- a/modules/ensembl/features/combine_json/meta.yml +++ b/modules/ensembl/features/combine_json/meta.yml @@ -45,11 +45,27 @@ output: type: file description: Combined feature JSON file. pattern: "*.json" + versions_features_combine_json: + - - ${task.process}: + type: string + description: The name of the process. + - features_combine_json: + type: string + description: The name of the tool. + - echo 1.6.1: + type: eval + description: The expression to obtain the version of the tool. +topics: versions: - - versions.yml: - type: file - description: File containing software versions. - pattern: "versions.yml" + - - ${task.process}: + type: string + description: The name of the process. + - features_combine_json: + type: string + description: The name of the tool. + - echo 1.6.1: + type: eval + description: The expression to obtain the version of the tool. 
authors: - "ensembl-dev@ebi.ac.uk" maintainers: diff --git a/modules/ensembl/features/combine_json/tests/main.nf.test.snap b/modules/ensembl/features/combine_json/tests/main.nf.test.snap index 238ac28..986d31b 100644 --- a/modules/ensembl/features/combine_json/tests/main.nf.test.snap +++ b/modules/ensembl/features/combine_json/tests/main.nf.test.snap @@ -12,7 +12,11 @@ ] ], "1": [ - "versions.yml:md5,8258695d6a28a46edbaf2a9bf2dde339" + [ + "FEATURES_COMBINE_JSON", + "features_combine_json", + "1.6.1" + ] ], "combined_json": [ [ @@ -23,12 +27,16 @@ "test.features.json:md5,aefc84472e26178b64d01051be6d58b2" ] ], - "versions": [ - "versions.yml:md5,8258695d6a28a46edbaf2a9bf2dde339" + "versions_features_combine_json": [ + [ + "FEATURES_COMBINE_JSON", + "features_combine_json", + "1.6.1" + ] ] } ], - "timestamp": "2026-03-11T12:27:17.171188", + "timestamp": "2026-05-12T13:52:12.162809", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" @@ -47,7 +55,11 @@ ] ], "1": [ - "versions.yml:md5,8258695d6a28a46edbaf2a9bf2dde339" + [ + "FEATURES_COMBINE_JSON", + "features_combine_json", + "1.6.1" + ] ], "combined_json": [ [ @@ -58,12 +70,16 @@ "test.features.json:md5,67c630685f9c819ef28574144c284b4e" ] ], - "versions": [ - "versions.yml:md5,8258695d6a28a46edbaf2a9bf2dde339" + "versions_features_combine_json": [ + [ + "FEATURES_COMBINE_JSON", + "features_combine_json", + "1.6.1" + ] ] } ], - "timestamp": "2026-03-11T12:27:15.074952", + "timestamp": "2026-05-12T13:52:09.797407", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" @@ -82,7 +98,11 @@ ] ], "1": [ - "versions.yml:md5,8258695d6a28a46edbaf2a9bf2dde339" + [ + "FEATURES_COMBINE_JSON", + "features_combine_json", + "1.6.1" + ] ], "combined_json": [ [ @@ -93,12 +113,16 @@ "test.features.json:md5,67c630685f9c819ef28574144c284b4e" ] ], - "versions": [ - "versions.yml:md5,8258695d6a28a46edbaf2a9bf2dde339" + "versions_features_combine_json": [ + [ + "FEATURES_COMBINE_JSON", + "features_combine_json", + "1.6.1" + ] ] } ], - 
"timestamp": "2026-03-11T12:27:19.259793", + "timestamp": "2026-05-12T13:52:14.52976", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" @@ -117,7 +141,11 @@ ] ], "1": [ - "versions.yml:md5,8258695d6a28a46edbaf2a9bf2dde339" + [ + "FEATURES_COMBINE_JSON", + "features_combine_json", + "1.6.1" + ] ], "combined_json": [ [ @@ -128,15 +156,19 @@ "test.features.json:md5,aefc84472e26178b64d01051be6d58b2" ] ], - "versions": [ - "versions.yml:md5,8258695d6a28a46edbaf2a9bf2dde339" + "versions_features_combine_json": [ + [ + "FEATURES_COMBINE_JSON", + "features_combine_json", + "1.6.1" + ] ] } ], - "timestamp": "2026-03-11T12:27:12.976715", + "timestamp": "2026-05-12T13:52:07.471915", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" } } -} +} \ No newline at end of file From 6c07eb1fb50f7ee4c8d0c001800ba59fb884eeef Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Thu, 14 May 2026 10:42:24 +0100 Subject: [PATCH 27/36] Code review update --- modules/assets/NO_FILE | 0 modules/ensembl/fasta/recombine/main.nf | 22 ++++++++++++++--- modules/ensembl/fasta/recombine/meta.yml | 4 ++-- .../fasta/recombine/tests/main.nf.test | 2 -- modules/ensembl/fasta/split/main.nf | 23 ++++++++++++++---- modules/ensembl/fasta/split/meta.yml | 4 ++-- .../ensembl/fasta/split/tests/main.nf.test | 2 -- modules/ensembl/features/combine_json/main.nf | 23 ++++++++++++++---- .../ensembl/features/combine_json/meta.yml | 4 ++-- requirements-dev.txt | 2 -- tests/config/nextflow.config | 2 +- tests/conftest.py | 24 ------------------- 12 files changed, 64 insertions(+), 48 deletions(-) delete mode 100644 modules/assets/NO_FILE delete mode 100644 requirements-dev.txt delete mode 100644 tests/conftest.py diff --git a/modules/assets/NO_FILE b/modules/assets/NO_FILE deleted file mode 100644 index e69de29..0000000 diff --git a/modules/ensembl/fasta/recombine/main.nf b/modules/ensembl/fasta/recombine/main.nf index 1e2e88c..4992c35 100644 --- a/modules/ensembl/fasta/recombine/main.nf +++ 
b/modules/ensembl/fasta/recombine/main.nf @@ -13,6 +13,21 @@ // See the License for the specific language governing permissions and // limitations under the License. +params.ensembl_genomio_version_cmd = ''' +python - <<'PY' +from importlib.metadata import distributions + +print(next( + ( + dist.version + for dist in distributions() + if dist.metadata["Name"].lower().replace("_", "-") == "ensembl-genomio" + ), + "unknown", +)) +PY +'''.stripIndent() + process FASTA_RECOMBINE { tag "${meta.id}" @@ -28,7 +43,10 @@ process FASTA_RECOMBINE { output: tuple val(meta), path("${meta.id}.fa"), emit: recombined_fasta - tuple val("${task.process}"), val('fasta_recombine'), eval('echo 1.6.1'), emit: versions_fasta_recombine, topic: versions + tuple val("${task.process}"), + val('fasta_recombine'), + eval(params.ensembl_genomio_version_cmd), + emit: versions_fasta_recombine, topic: versions when: task.ext.when == null || task.ext.when @@ -61,8 +79,6 @@ process FASTA_RECOMBINE { stub: """ - set -euo pipefail - out_fa="${meta.id}.fa" touch "\$out_fa" """ diff --git a/modules/ensembl/fasta/recombine/meta.yml b/modules/ensembl/fasta/recombine/meta.yml index 8583e3e..c28be5b 100644 --- a/modules/ensembl/fasta/recombine/meta.yml +++ b/modules/ensembl/fasta/recombine/meta.yml @@ -52,7 +52,7 @@ output: - fasta_recombine: type: string description: The name of the tool. - - echo 1.6.1: + - params.ensembl_genomio_version_cmd: type: eval description: The expression to obtain the version of the tool. topics: @@ -63,7 +63,7 @@ topics: - fasta_recombine: type: string description: The name of the tool. - - echo 1.6.1: + - params.ensembl_genomio_version_cmd: type: eval description: The expression to obtain the version of the tool. 
authors: diff --git a/modules/ensembl/fasta/recombine/tests/main.nf.test b/modules/ensembl/fasta/recombine/tests/main.nf.test index 4448cf7..a0c650d 100644 --- a/modules/ensembl/fasta/recombine/tests/main.nf.test +++ b/modules/ensembl/fasta/recombine/tests/main.nf.test @@ -20,8 +20,6 @@ nextflow_process { script "../main.nf" process "FASTA_RECOMBINE" - tag "modules" - tag "modules_ensembl" tag "fasta" tag "fasta/recombine" diff --git a/modules/ensembl/fasta/split/main.nf b/modules/ensembl/fasta/split/main.nf index 2e3acc5..7e22745 100644 --- a/modules/ensembl/fasta/split/main.nf +++ b/modules/ensembl/fasta/split/main.nf @@ -13,8 +13,22 @@ // See the License for the specific language governing permissions and // limitations under the License. -process FASTA_SPLIT { +params.ensembl_genomio_version_cmd = ''' +python - <<'PY' +from importlib.metadata import distributions + +print(next( + ( + dist.version + for dist in distributions() + if dist.metadata["Name"].lower().replace("_", "-") == "ensembl-genomio" + ), + "unknown", +)) +PY +'''.stripIndent() +process FASTA_SPLIT { tag "${meta.id}" label 'process_medium' @@ -29,7 +43,10 @@ process FASTA_SPLIT { output: tuple val(meta), path("splits/**/*.fa"), emit: fastas tuple val(meta), path("splits/*.agp"), emit: agp, optional: true - tuple val("${task.process}"), val('fasta_split'), eval('echo 1.6.1'), emit: versions_fasta_split, topic: versions + tuple val("${task.process}"), + val('fasta_split'), + eval(params.ensembl_genomio_version_cmd), + emit: versions_fasta_split, topic: versions when: task.ext.when == null || task.ext.when @@ -82,8 +99,6 @@ process FASTA_SPLIT { stub: """ - set -euo pipefail - layout="default" if [[ "${params.unique_file_names ?: false}" == "true" ]]; then layout="unique" diff --git a/modules/ensembl/fasta/split/meta.yml b/modules/ensembl/fasta/split/meta.yml index 96303dd..90e45ca 100644 --- a/modules/ensembl/fasta/split/meta.yml +++ b/modules/ensembl/fasta/split/meta.yml @@ -60,7 +60,7 @@ output: 
- fasta_split: type: string description: The name of the tool. - - echo 1.6.1: + - params.ensembl_genomio_version_cmd: type: eval description: The expression to obtain the version of the tool. topics: @@ -71,7 +71,7 @@ topics: - fasta_split: type: string description: The name of the tool. - - echo 1.6.1: + - params.ensembl_genomio_version_cmd: type: eval description: The expression to obtain the version of the tool. authors: diff --git a/modules/ensembl/fasta/split/tests/main.nf.test b/modules/ensembl/fasta/split/tests/main.nf.test index 042ff9c..5906c14 100644 --- a/modules/ensembl/fasta/split/tests/main.nf.test +++ b/modules/ensembl/fasta/split/tests/main.nf.test @@ -20,8 +20,6 @@ nextflow_process { script "../main.nf" process "FASTA_SPLIT" - tag "modules" - tag "modules_ensembl" tag "fasta" tag "fasta/split" diff --git a/modules/ensembl/features/combine_json/main.nf b/modules/ensembl/features/combine_json/main.nf index a44767e..8e55063 100644 --- a/modules/ensembl/features/combine_json/main.nf +++ b/modules/ensembl/features/combine_json/main.nf @@ -13,8 +13,22 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-process FEATURES_COMBINE_JSON { +params.ensembl_genomio_version_cmd = ''' +python - <<'PY' +from importlib.metadata import distributions + +print(next( + ( + dist.version + for dist in distributions() + if dist.metadata["Name"].lower().replace("_", "-") == "ensembl-genomio" + ), + "unknown", +)) +PY +'''.stripIndent() +process FEATURES_COMBINE_JSON { tag "${meta.id}" label 'process_medium' @@ -28,7 +42,10 @@ process FEATURES_COMBINE_JSON { output: tuple val(meta), path("${meta.id}.${meta.analysis}.json"), emit: combined_json - tuple val("${task.process}"), val('features_combine_json'), eval('echo 1.6.1'), emit: versions_features_combine_json, topic: versions + tuple val("${task.process}"), + val('features_combine_json'), + eval(params.ensembl_genomio_version_cmd), + emit: versions_features_combine_json, topic: versions when: task.ext.when == null || task.ext.when @@ -61,8 +78,6 @@ process FEATURES_COMBINE_JSON { stub: """ - set -euo pipefail - out_json="${meta.id}.${meta.analysis}.json" test -s "${json_manifest}" diff --git a/modules/ensembl/features/combine_json/meta.yml b/modules/ensembl/features/combine_json/meta.yml index e524e75..e14d694 100644 --- a/modules/ensembl/features/combine_json/meta.yml +++ b/modules/ensembl/features/combine_json/meta.yml @@ -52,7 +52,7 @@ output: - features_combine_json: type: string description: The name of the tool. - - echo 1.6.1: + - params.ensembl_genomio_version_cmd: type: eval description: The expression to obtain the version of the tool. topics: @@ -63,7 +63,7 @@ topics: - features_combine_json: type: string description: The name of the tool. - - echo 1.6.1: + - params.ensembl_genomio_version_cmd: type: eval description: The expression to obtain the version of the tool. 
authors: diff --git a/requirements-dev.txt b/requirements-dev.txt deleted file mode 100644 index c0367d2..0000000 --- a/requirements-dev.txt +++ /dev/null @@ -1,2 +0,0 @@ -biopython -pytest \ No newline at end of file diff --git a/tests/config/nextflow.config b/tests/config/nextflow.config index a527e1f..e4c8606 100644 --- a/tests/config/nextflow.config +++ b/tests/config/nextflow.config @@ -16,5 +16,5 @@ includeConfig 'test_data.config' singularity { - enabled = false + enabled = true } diff --git a/tests/conftest.py b/tests/conftest.py deleted file mode 100644 index 766dbc3..0000000 --- a/tests/conftest.py +++ /dev/null @@ -1,24 +0,0 @@ -import importlib.util -from pathlib import Path - -import pytest - - -@pytest.fixture(scope="session") -def split_fasta_module(): - """ - Load modules/ensembl/fasta/splitfasta/split_fasta.py as a Python module - regardless of whether 'modules/' is a Python package. - """ - repo_root = Path(__file__).resolve().parents[1] - module_path = ( - repo_root / "modules" / "ensembl" / "fasta" / "splitfasta" / "split_fasta.py" - ) - - spec = importlib.util.spec_from_file_location("split_fasta", module_path) - if spec is None or spec.loader is None: - raise RuntimeError(f"Could not load module spec from {module_path}") - - mod = importlib.util.module_from_spec(spec) - spec.loader.exec_module(mod) - return mod From 4e6e53f7d23b39b11cf9dd5d4335be06ef40815e Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Thu, 14 May 2026 10:49:00 +0100 Subject: [PATCH 28/36] Remove blank line --- modules/ensembl/fasta/recombine/main.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/ensembl/fasta/recombine/main.nf b/modules/ensembl/fasta/recombine/main.nf index 4992c35..1cb042f 100644 --- a/modules/ensembl/fasta/recombine/main.nf +++ b/modules/ensembl/fasta/recombine/main.nf @@ -69,7 +69,6 @@ process FASTA_RECOMBINE { } def out_fasta = "${meta.id}.fa" - """ fasta_recombine \\ --fasta-manifest ${fasta_manifest} \\ From 
a4d48549e1ead62b963cdf13d0c2720efefad99f Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Thu, 14 May 2026 13:56:32 +0100 Subject: [PATCH 29/36] Use single line version cmd for param --- modules/ensembl/fasta/recombine/main.nf | 15 +-------------- modules/ensembl/fasta/split/main.nf | 15 +-------------- modules/ensembl/features/combine_json/main.nf | 15 +-------------- 3 files changed, 3 insertions(+), 42 deletions(-) diff --git a/modules/ensembl/fasta/recombine/main.nf b/modules/ensembl/fasta/recombine/main.nf index 1cb042f..2aa07d7 100644 --- a/modules/ensembl/fasta/recombine/main.nf +++ b/modules/ensembl/fasta/recombine/main.nf @@ -13,20 +13,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -params.ensembl_genomio_version_cmd = ''' -python - <<'PY' -from importlib.metadata import distributions - -print(next( - ( - dist.version - for dist in distributions() - if dist.metadata["Name"].lower().replace("_", "-") == "ensembl-genomio" - ), - "unknown", -)) -PY -'''.stripIndent() +params.ensembl_genomio_version_cmd = "python -c 'from importlib.metadata import distributions; print(next((dist.version for dist in distributions() if dist.metadata[\"Name\"].lower().replace(\"_\", \"-\") == \"ensembl-genomio\"), \"unknown\"))'" process FASTA_RECOMBINE { diff --git a/modules/ensembl/fasta/split/main.nf b/modules/ensembl/fasta/split/main.nf index 7e22745..b17f539 100644 --- a/modules/ensembl/fasta/split/main.nf +++ b/modules/ensembl/fasta/split/main.nf @@ -13,20 +13,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-params.ensembl_genomio_version_cmd = ''' -python - <<'PY' -from importlib.metadata import distributions - -print(next( - ( - dist.version - for dist in distributions() - if dist.metadata["Name"].lower().replace("_", "-") == "ensembl-genomio" - ), - "unknown", -)) -PY -'''.stripIndent() +params.ensembl_genomio_version_cmd = "python -c 'from importlib.metadata import distributions; print(next((dist.version for dist in distributions() if dist.metadata[\"Name\"].lower().replace(\"_\", \"-\") == \"ensembl-genomio\"), \"unknown\"))'" process FASTA_SPLIT { tag "${meta.id}" diff --git a/modules/ensembl/features/combine_json/main.nf b/modules/ensembl/features/combine_json/main.nf index 8e55063..9bed496 100644 --- a/modules/ensembl/features/combine_json/main.nf +++ b/modules/ensembl/features/combine_json/main.nf @@ -13,20 +13,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -params.ensembl_genomio_version_cmd = ''' -python - <<'PY' -from importlib.metadata import distributions - -print(next( - ( - dist.version - for dist in distributions() - if dist.metadata["Name"].lower().replace("_", "-") == "ensembl-genomio" - ), - "unknown", -)) -PY -'''.stripIndent() +params.ensembl_genomio_version_cmd = "python -c 'from importlib.metadata import distributions; print(next((dist.version for dist in distributions() if dist.metadata[\"Name\"].lower().replace(\"_\", \"-\") == \"ensembl-genomio\"), \"unknown\"))'" process FEATURES_COMBINE_JSON { tag "${meta.id}" From 59e24c4c3ac26cf93a9b71235a5b228329c1b389 Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Thu, 14 May 2026 14:18:52 +0100 Subject: [PATCH 30/36] Use command directly within eval --- modules/ensembl/fasta/recombine/main.nf | 4 +--- modules/ensembl/fasta/split/main.nf | 4 +--- modules/ensembl/features/combine_json/main.nf | 4 +--- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/modules/ensembl/fasta/recombine/main.nf 
b/modules/ensembl/fasta/recombine/main.nf index 2aa07d7..2d86da7 100644 --- a/modules/ensembl/fasta/recombine/main.nf +++ b/modules/ensembl/fasta/recombine/main.nf @@ -13,8 +13,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -params.ensembl_genomio_version_cmd = "python -c 'from importlib.metadata import distributions; print(next((dist.version for dist in distributions() if dist.metadata[\"Name\"].lower().replace(\"_\", \"-\") == \"ensembl-genomio\"), \"unknown\"))'" - process FASTA_RECOMBINE { tag "${meta.id}" @@ -32,7 +30,7 @@ process FASTA_RECOMBINE { tuple val(meta), path("${meta.id}.fa"), emit: recombined_fasta tuple val("${task.process}"), val('fasta_recombine'), - eval(params.ensembl_genomio_version_cmd), + eval("python -c 'from importlib.metadata import distributions; print(next((dist.version for dist in distributions() if dist.metadata[\"Name\"].lower().replace(\"_\", \"-\") == \"ensembl-genomio\"), \"unknown\"))'"), emit: versions_fasta_recombine, topic: versions when: diff --git a/modules/ensembl/fasta/split/main.nf b/modules/ensembl/fasta/split/main.nf index b17f539..b287f6a 100644 --- a/modules/ensembl/fasta/split/main.nf +++ b/modules/ensembl/fasta/split/main.nf @@ -13,8 +13,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-params.ensembl_genomio_version_cmd = "python -c 'from importlib.metadata import distributions; print(next((dist.version for dist in distributions() if dist.metadata[\"Name\"].lower().replace(\"_\", \"-\") == \"ensembl-genomio\"), \"unknown\"))'" - process FASTA_SPLIT { tag "${meta.id}" label 'process_medium' @@ -32,7 +30,7 @@ process FASTA_SPLIT { tuple val(meta), path("splits/*.agp"), emit: agp, optional: true tuple val("${task.process}"), val('fasta_split'), - eval(params.ensembl_genomio_version_cmd), + eval("python -c 'from importlib.metadata import distributions; print(next((dist.version for dist in distributions() if dist.metadata[\"Name\"].lower().replace(\"_\", \"-\") == \"ensembl-genomio\"), \"unknown\"))'"), emit: versions_fasta_split, topic: versions when: diff --git a/modules/ensembl/features/combine_json/main.nf b/modules/ensembl/features/combine_json/main.nf index 9bed496..bac3f91 100644 --- a/modules/ensembl/features/combine_json/main.nf +++ b/modules/ensembl/features/combine_json/main.nf @@ -13,8 +13,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-params.ensembl_genomio_version_cmd = "python -c 'from importlib.metadata import distributions; print(next((dist.version for dist in distributions() if dist.metadata[\"Name\"].lower().replace(\"_\", \"-\") == \"ensembl-genomio\"), \"unknown\"))'" - process FEATURES_COMBINE_JSON { tag "${meta.id}" label 'process_medium' @@ -31,7 +29,7 @@ process FEATURES_COMBINE_JSON { tuple val(meta), path("${meta.id}.${meta.analysis}.json"), emit: combined_json tuple val("${task.process}"), val('features_combine_json'), - eval(params.ensembl_genomio_version_cmd), + eval("python -c 'from importlib.metadata import distributions; print(next((dist.version for dist in distributions() if dist.metadata[\"Name\"].lower().replace(\"_\", \"-\") == \"ensembl-genomio\"), \"unknown\"))'"), emit: versions_features_combine_json, topic: versions when: From 30961a3db2c7f09efa988594194a6dd7f0c821d8 Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Thu, 14 May 2026 14:40:24 +0100 Subject: [PATCH 31/36] Update snapshots --- .../fasta/recombine/tests/main.nf.test.snap | 12 +++++----- .../fasta/split/tests/main.nf.test.snap | 24 +++++++++---------- .../combine_json/tests/main.nf.test.snap | 24 +++++++++---------- 3 files changed, 30 insertions(+), 30 deletions(-) diff --git a/modules/ensembl/fasta/recombine/tests/main.nf.test.snap b/modules/ensembl/fasta/recombine/tests/main.nf.test.snap index 2ad0719..3f33a6d 100644 --- a/modules/ensembl/fasta/recombine/tests/main.nf.test.snap +++ b/modules/ensembl/fasta/recombine/tests/main.nf.test.snap @@ -14,7 +14,7 @@ [ "FASTA_RECOMBINE", "fasta_recombine", - "1.6.1" + "1.6.2" ] ], "recombined_fasta": [ @@ -29,12 +29,12 @@ [ "FASTA_RECOMBINE", "fasta_recombine", - "1.6.1" + "1.6.2" ] ] } ], - "timestamp": "2026-05-12T13:50:13.249443", + "timestamp": "2026-05-14T14:39:11.350698", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" @@ -55,7 +55,7 @@ [ "FASTA_RECOMBINE", "fasta_recombine", - "1.6.1" + "1.6.2" ] ], "recombined_fasta": [ @@ -70,12 +70,12 @@ [ 
"FASTA_RECOMBINE", "fasta_recombine", - "1.6.1" + "1.6.2" ] ] } ], - "timestamp": "2026-05-12T13:50:11.167936", + "timestamp": "2026-05-14T14:39:09.216174", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" diff --git a/modules/ensembl/fasta/split/tests/main.nf.test.snap b/modules/ensembl/fasta/split/tests/main.nf.test.snap index 9914c59..ebe20c3 100644 --- a/modules/ensembl/fasta/split/tests/main.nf.test.snap +++ b/modules/ensembl/fasta/split/tests/main.nf.test.snap @@ -20,7 +20,7 @@ [ "FASTA_SPLIT", "fasta_split", - "1.6.1" + "1.6.2" ] ], "agp": [ @@ -41,12 +41,12 @@ [ "FASTA_SPLIT", "fasta_split", - "1.6.1" + "1.6.2" ] ] } ], - "timestamp": "2026-05-12T13:50:15.366293", + "timestamp": "2026-05-14T14:38:41.602246", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" @@ -78,7 +78,7 @@ [ "FASTA_SPLIT", "fasta_split", - "1.6.1" + "1.6.2" ] ], "agp": [ @@ -104,12 +104,12 @@ [ "FASTA_SPLIT", "fasta_split", - "1.6.1" + "1.6.2" ] ] } ], - "timestamp": "2026-05-12T13:50:17.499614", + "timestamp": "2026-05-14T14:38:43.765608", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" @@ -136,7 +136,7 @@ [ "FASTA_SPLIT", "fasta_split", - "1.6.1" + "1.6.2" ] ], "agp": [ @@ -157,12 +157,12 @@ [ "FASTA_SPLIT", "fasta_split", - "1.6.1" + "1.6.2" ] ] } ], - "timestamp": "2026-05-12T13:50:21.755317", + "timestamp": "2026-05-14T14:38:48.132705", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" @@ -189,7 +189,7 @@ [ "FASTA_SPLIT", "fasta_split", - "1.6.1" + "1.6.2" ] ], "agp": [ @@ -210,12 +210,12 @@ [ "FASTA_SPLIT", "fasta_split", - "1.6.1" + "1.6.2" ] ] } ], - "timestamp": "2026-05-12T13:50:19.618244", + "timestamp": "2026-05-14T14:38:45.953655", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" diff --git a/modules/ensembl/features/combine_json/tests/main.nf.test.snap b/modules/ensembl/features/combine_json/tests/main.nf.test.snap index 986d31b..5850f06 100644 --- a/modules/ensembl/features/combine_json/tests/main.nf.test.snap +++ 
b/modules/ensembl/features/combine_json/tests/main.nf.test.snap @@ -15,7 +15,7 @@ [ "FEATURES_COMBINE_JSON", "features_combine_json", - "1.6.1" + "1.6.2" ] ], "combined_json": [ @@ -31,12 +31,12 @@ [ "FEATURES_COMBINE_JSON", "features_combine_json", - "1.6.1" + "1.6.2" ] ] } ], - "timestamp": "2026-05-12T13:52:12.162809", + "timestamp": "2026-05-14T14:39:29.784572", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" @@ -58,7 +58,7 @@ [ "FEATURES_COMBINE_JSON", "features_combine_json", - "1.6.1" + "1.6.2" ] ], "combined_json": [ @@ -74,12 +74,12 @@ [ "FEATURES_COMBINE_JSON", "features_combine_json", - "1.6.1" + "1.6.2" ] ] } ], - "timestamp": "2026-05-12T13:52:09.797407", + "timestamp": "2026-05-14T14:39:27.607529", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" @@ -101,7 +101,7 @@ [ "FEATURES_COMBINE_JSON", "features_combine_json", - "1.6.1" + "1.6.2" ] ], "combined_json": [ @@ -117,12 +117,12 @@ [ "FEATURES_COMBINE_JSON", "features_combine_json", - "1.6.1" + "1.6.2" ] ] } ], - "timestamp": "2026-05-12T13:52:14.52976", + "timestamp": "2026-05-14T14:39:31.963829", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" @@ -144,7 +144,7 @@ [ "FEATURES_COMBINE_JSON", "features_combine_json", - "1.6.1" + "1.6.2" ] ], "combined_json": [ @@ -160,12 +160,12 @@ [ "FEATURES_COMBINE_JSON", "features_combine_json", - "1.6.1" + "1.6.2" ] ] } ], - "timestamp": "2026-05-12T13:52:07.471915", + "timestamp": "2026-05-14T14:39:25.403423", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.3" From b771306996e506b30787a1e59852b934a9c897bc Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Thu, 14 May 2026 16:38:48 +0100 Subject: [PATCH 32/36] Linting fixes --- modules/ensembl/fasta/recombine/main.nf | 7 ++-- modules/ensembl/fasta/recombine/meta.yml | 27 ++++++++------ .../fasta/recombine/tests/main.nf.test | 2 ++ modules/ensembl/fasta/split/main.nf | 7 ++-- modules/ensembl/fasta/split/meta.yml | 31 +++++++++------- .../ensembl/fasta/split/tests/main.nf.test | 2 ++ 
modules/ensembl/features/combine_json/main.nf | 15 ++++---- .../ensembl/features/combine_json/meta.yml | 35 ++++++++++++------- .../features/combine_json/tests/main.nf.test | 12 ++++--- .../combine_json/tests/main.nf.test.snap | 24 +++++-------- 10 files changed, 86 insertions(+), 76 deletions(-) diff --git a/modules/ensembl/fasta/recombine/main.nf b/modules/ensembl/fasta/recombine/main.nf index 2d86da7..0c8afa9 100644 --- a/modules/ensembl/fasta/recombine/main.nf +++ b/modules/ensembl/fasta/recombine/main.nf @@ -21,17 +21,14 @@ process FASTA_RECOMBINE { conda "${moduleDir}/environment.yml" container "${workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ensembl-genomio:1.6.1--pyhdfd78af_0' - : 'biocontainers/ensembl-genomio:1.6.1--pyhdfd78af_0'}" + : 'quay.io/biocontainers/ensembl-genomio:1.6.1--pyhdfd78af_0'}" input: tuple val(meta), path(fasta_manifest), path(agp) output: tuple val(meta), path("${meta.id}.fa"), emit: recombined_fasta - tuple val("${task.process}"), - val('fasta_recombine'), - eval("python -c 'from importlib.metadata import distributions; print(next((dist.version for dist in distributions() if dist.metadata[\"Name\"].lower().replace(\"_\", \"-\") == \"ensembl-genomio\"), \"unknown\"))'"), - emit: versions_fasta_recombine, topic: versions + tuple val("${task.process}"), val('fasta_recombine'), eval("python -c 'from importlib.metadata import distributions; print(next((dist.version for dist in distributions() if dist.metadata[\"Name\"].lower().replace(\"_\", \"-\") == \"ensembl-genomio\"), \"unknown\"))'"), emit: versions_fasta_recombine, topic: versions when: task.ext.when == null || task.ext.when diff --git a/modules/ensembl/fasta/recombine/meta.yml b/modules/ensembl/fasta/recombine/meta.yml index c28be5b..2836455 100644 --- a/modules/ensembl/fasta/recombine/meta.yml +++ b/modules/ensembl/fasta/recombine/meta.yml @@ -1,4 +1,3 @@ ---- name: 
"fasta_recombine" description: Recombine split FASTA sequences into a single FASTA file, optionally using an AGP file. @@ -8,14 +7,13 @@ keywords: - genomics - genomio - recombine - tools: - "fasta_recombine": description: "Recombine split FASTA sequences generated by ensembl-genomio." homepage: "https://github.com/Ensembl/ensembl-genomio" - licence: ["Apache License version 2.0"] + licence: + - "Apache License version 2.0" identifier: "" - input: - - meta: type: map @@ -41,10 +39,11 @@ output: description: | Groovy Map containing meta information e.g. `[ id:'accession1' ]` - - "${meta.id}.fa": + - ${meta.id}.fa: type: file description: Recombined FASTA file. pattern: "*.fa" + ontologies: [] versions_fasta_recombine: - - ${task.process}: type: string @@ -52,9 +51,12 @@ output: - fasta_recombine: type: string description: The name of the tool. - - params.ensembl_genomio_version_cmd: - type: eval - description: The expression to obtain the version of the tool. + - ? python -c 'from importlib.metadata import distributions; + print(next((dist.version for dist in distributions() if + dist.metadata["Name"].lower().replace("_", "-") == "ensembl-genomio"), + "unknown"))' + : type: eval + description: The expression to obtain the version of the tool topics: versions: - - ${task.process}: @@ -63,9 +65,12 @@ topics: - fasta_recombine: type: string description: The name of the tool. - - params.ensembl_genomio_version_cmd: - type: eval - description: The expression to obtain the version of the tool. + - ? 
python -c 'from importlib.metadata import distributions; + print(next((dist.version for dist in distributions() if + dist.metadata["Name"].lower().replace("_", "-") == "ensembl-genomio"), + "unknown"))' + : type: eval + description: The expression to obtain the version of the tool authors: - "ensembl-dev@ebi.ac.uk" maintainers: diff --git a/modules/ensembl/fasta/recombine/tests/main.nf.test b/modules/ensembl/fasta/recombine/tests/main.nf.test index a0c650d..4448cf7 100644 --- a/modules/ensembl/fasta/recombine/tests/main.nf.test +++ b/modules/ensembl/fasta/recombine/tests/main.nf.test @@ -20,6 +20,8 @@ nextflow_process { script "../main.nf" process "FASTA_RECOMBINE" + tag "modules" + tag "modules_ensembl" tag "fasta" tag "fasta/recombine" diff --git a/modules/ensembl/fasta/split/main.nf b/modules/ensembl/fasta/split/main.nf index b287f6a..9b62fc0 100644 --- a/modules/ensembl/fasta/split/main.nf +++ b/modules/ensembl/fasta/split/main.nf @@ -20,7 +20,7 @@ process FASTA_SPLIT { conda "${moduleDir}/environment.yml" container "${workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/ensembl-genomio:1.6.1--pyhdfd78af_0' - : 'biocontainers/ensembl-genomio:1.6.1--pyhdfd78af_0'}" + : 'quay.io/biocontainers/ensembl-genomio:1.6.1--pyhdfd78af_0'}" input: tuple val(meta), path(fasta), val(longest_seq_bp) @@ -28,10 +28,7 @@ process FASTA_SPLIT { output: tuple val(meta), path("splits/**/*.fa"), emit: fastas tuple val(meta), path("splits/*.agp"), emit: agp, optional: true - tuple val("${task.process}"), - val('fasta_split'), - eval("python -c 'from importlib.metadata import distributions; print(next((dist.version for dist in distributions() if dist.metadata[\"Name\"].lower().replace(\"_\", \"-\") == \"ensembl-genomio\"), \"unknown\"))'"), - emit: versions_fasta_split, topic: versions + tuple val("${task.process}"), val('fasta_split'), eval("python -c 'from importlib.metadata import distributions; print(next((dist.version for dist in distributions() if dist.metadata[\"Name\"].lower().replace(\"_\", \"-\") == \"ensembl-genomio\"), \"unknown\"))'"), emit: versions_fasta_split, topic: versions when: task.ext.when == null || task.ext.when diff --git a/modules/ensembl/fasta/split/meta.yml b/modules/ensembl/fasta/split/meta.yml index 90e45ca..6617745 100644 --- a/modules/ensembl/fasta/split/meta.yml +++ b/modules/ensembl/fasta/split/meta.yml @@ -1,4 +1,3 @@ ---- name: "fasta_split" description: Split a FASTA file into smaller FASTA files and optionally write an AGP file. @@ -8,14 +7,13 @@ keywords: - genomics - genomio - split - tools: - "fasta_split": description: "Split FASTA files with ensembl-genomio." homepage: "https://github.com/Ensembl/ensembl-genomio" - licence: ["Apache License version 2.0"] + licence: + - "Apache License version 2.0" identifier: "" - input: - - meta: type: map @@ -31,7 +29,6 @@ input: type: integer description: Length in base pairs of the longest sequence in the input FASTA. - output: fastas: - - meta: @@ -39,20 +36,22 @@ output: description: | Groovy Map containing meta information e.g. 
`[ id:'accession1' ]` - - "splits/**/*.fa": + - splits/**/*.fa: type: file description: Split FASTA files. pattern: "splits/**/*.fa" + ontologies: [] agp: - - meta: type: map description: | Groovy Map containing meta information e.g. `[ id:'accession1' ]` - - "splits/*.agp": + - splits/*.agp: type: file description: Optional AGP file describing split sequence chunks. pattern: "splits/*.agp" + ontologies: [] versions_fasta_split: - - ${task.process}: type: string @@ -60,9 +59,12 @@ output: - fasta_split: type: string description: The name of the tool. - - params.ensembl_genomio_version_cmd: - type: eval - description: The expression to obtain the version of the tool. + - ? python -c 'from importlib.metadata import distributions; + print(next((dist.version for dist in distributions() if + dist.metadata["Name"].lower().replace("_", "-") == "ensembl-genomio"), + "unknown"))' + : type: eval + description: The expression to obtain the version of the tool topics: versions: - - ${task.process}: @@ -71,9 +73,12 @@ topics: - fasta_split: type: string description: The name of the tool. - - params.ensembl_genomio_version_cmd: - type: eval - description: The expression to obtain the version of the tool. + - ? 
python -c 'from importlib.metadata import distributions; + print(next((dist.version for dist in distributions() if + dist.metadata["Name"].lower().replace("_", "-") == "ensembl-genomio"), + "unknown"))' + : type: eval + description: The expression to obtain the version of the tool authors: - "ensembl-dev@ebi.ac.uk" maintainers: diff --git a/modules/ensembl/fasta/split/tests/main.nf.test b/modules/ensembl/fasta/split/tests/main.nf.test index 5906c14..042ff9c 100644 --- a/modules/ensembl/fasta/split/tests/main.nf.test +++ b/modules/ensembl/fasta/split/tests/main.nf.test @@ -20,6 +20,8 @@ nextflow_process { script "../main.nf" process "FASTA_SPLIT" + tag "modules" + tag "modules_ensembl" tag "fasta" tag "fasta/split" diff --git a/modules/ensembl/features/combine_json/main.nf b/modules/ensembl/features/combine_json/main.nf index bac3f91..ebe2f7e 100644 --- a/modules/ensembl/features/combine_json/main.nf +++ b/modules/ensembl/features/combine_json/main.nf @@ -20,17 +20,14 @@ process FEATURES_COMBINE_JSON { conda "${moduleDir}/environment.yml" container "${workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/ensembl-genomio:1.6.1--pyhdfd78af_0' - : 'biocontainers/ensembl-genomio:1.6.1--pyhdfd78af_0'}" + : 'quay.io/biocontainers/ensembl-genomio:1.6.1--pyhdfd78af_0'}" input: - tuple val(meta), path(json_manifest), path(agp) + tuple val(meta), val(analysis), path(json_manifest), path(agp) output: - tuple val(meta), path("${meta.id}.${meta.analysis}.json"), emit: combined_json - tuple val("${task.process}"), - val('features_combine_json'), - eval("python -c 'from importlib.metadata import distributions; print(next((dist.version for dist in distributions() if dist.metadata[\"Name\"].lower().replace(\"_\", \"-\") == \"ensembl-genomio\"), \"unknown\"))'"), - emit: versions_features_combine_json, topic: versions + tuple val(meta), path("${meta.id}.${analysis}.json"), emit: combined_json + tuple val("${task.process}"), val('features_combine_json'), eval("python -c 'from importlib.metadata import distributions; print(next((dist.version for dist in distributions() if dist.metadata[\"Name\"].lower().replace(\"_\", \"-\") == \"ensembl-genomio\"), \"unknown\"))'"), emit: versions_features_combine_json, topic: versions when: task.ext.when == null || task.ext.when @@ -52,7 +49,7 @@ process FEATURES_COMBINE_JSON { args << "--agp-file '${agp}'" } - def out_json = "${meta.id}.${meta.analysis}.json" + def out_json = "${meta.id}.${analysis}.json" """ features_combine_json \\ @@ -63,7 +60,7 @@ process FEATURES_COMBINE_JSON { stub: """ - out_json="${meta.id}.${meta.analysis}.json" + out_json="${meta.id}.${analysis}.json" test -s "${json_manifest}" diff --git a/modules/ensembl/features/combine_json/meta.yml b/modules/ensembl/features/combine_json/meta.yml index e14d694..4558ebe 100644 --- a/modules/ensembl/features/combine_json/meta.yml +++ b/modules/ensembl/features/combine_json/meta.yml @@ -1,4 +1,3 @@ ---- name: "features_combine_json" description: Combine split feature JSON files into a single JSON file, optionally using an AGP file. 
@@ -8,20 +7,22 @@ keywords: - genomics - genomio - json - tools: - "features_combine_json": description: "Combine split feature JSON files generated by ensembl-genomio." homepage: "https://github.com/Ensembl/ensembl-genomio" - licence: ["Apache License version 2.0"] + licence: + - "Apache License version 2.0" identifier: "" - input: - - meta: type: map description: | Groovy Map containing meta information - e.g. `[ id:'accession1', analysis:'repeat' ]` + e.g. `[ id:'accession1' ]` + - analysis: + type: string + description: Analysis name to include in the combined JSON filename. - json_manifest: type: file description: Manifest file listing split JSON files to combine. @@ -40,11 +41,13 @@ output: type: map description: | Groovy Map containing meta information - e.g. `[ id:'accession1', analysis:'repeat' ]` - - "${meta.id}.${meta.analysis}.json": + e.g. `[ id:'accession1' ]` + - ${meta.id}.${analysis}.json: type: file description: Combined feature JSON file. pattern: "*.json" + ontologies: + - edam: http://edamontology.org/format_3464 versions_features_combine_json: - - ${task.process}: type: string @@ -52,9 +55,12 @@ output: - features_combine_json: type: string description: The name of the tool. - - params.ensembl_genomio_version_cmd: - type: eval - description: The expression to obtain the version of the tool. + - ? python -c 'from importlib.metadata import distributions; + print(next((dist.version for dist in distributions() if + dist.metadata["Name"].lower().replace("_", "-") == "ensembl-genomio"), + "unknown"))' + : type: eval + description: The expression to obtain the version of the tool topics: versions: - - ${task.process}: @@ -63,9 +69,12 @@ topics: - features_combine_json: type: string description: The name of the tool. - - params.ensembl_genomio_version_cmd: - type: eval - description: The expression to obtain the version of the tool. + - ? 
python -c 'from importlib.metadata import distributions; + print(next((dist.version for dist in distributions() if + dist.metadata["Name"].lower().replace("_", "-") == "ensembl-genomio"), + "unknown"))' + : type: eval + description: The expression to obtain the version of the tool authors: - "ensembl-dev@ebi.ac.uk" maintainers: diff --git a/modules/ensembl/features/combine_json/tests/main.nf.test b/modules/ensembl/features/combine_json/tests/main.nf.test index cb61d27..eb6af6d 100644 --- a/modules/ensembl/features/combine_json/tests/main.nf.test +++ b/modules/ensembl/features/combine_json/tests/main.nf.test @@ -42,7 +42,8 @@ nextflow_process { noFile.text = "" input[0] = [ - [ id:'test', analysis:'features' ], + [ id:'test' ], + 'features', manifest, noFile ] @@ -75,7 +76,8 @@ nextflow_process { noFile.text = "" input[0] = [ - [ id:'test', analysis:'features' ], + [ id:'test' ], + 'features', manifest, noFile ] @@ -108,7 +110,8 @@ nextflow_process { agp.text = "" input[0] = [ - [ id:'test', analysis:'features' ], + [ id:'test' ], + 'features', manifest, agp ] @@ -141,7 +144,8 @@ nextflow_process { agp.text = "" input[0] = [ - [ id:'test', analysis:'features' ], + [ id:'test' ], + 'features', manifest, agp ] diff --git a/modules/ensembl/features/combine_json/tests/main.nf.test.snap b/modules/ensembl/features/combine_json/tests/main.nf.test.snap index 5850f06..1927537 100644 --- a/modules/ensembl/features/combine_json/tests/main.nf.test.snap +++ b/modules/ensembl/features/combine_json/tests/main.nf.test.snap @@ -5,8 +5,7 @@ "0": [ [ { - "id": "test", - "analysis": "features" + "id": "test" }, "test.features.json:md5,aefc84472e26178b64d01051be6d58b2" ] @@ -21,8 +20,7 @@ "combined_json": [ [ { - "id": "test", - "analysis": "features" + "id": "test" }, "test.features.json:md5,aefc84472e26178b64d01051be6d58b2" ] @@ -48,8 +46,7 @@ "0": [ [ { - "id": "test", - "analysis": "features" + "id": "test" }, "test.features.json:md5,67c630685f9c819ef28574144c284b4e" ] @@ -64,8 
+61,7 @@ "combined_json": [ [ { - "id": "test", - "analysis": "features" + "id": "test" }, "test.features.json:md5,67c630685f9c819ef28574144c284b4e" ] @@ -91,8 +87,7 @@ "0": [ [ { - "id": "test", - "analysis": "features" + "id": "test" }, "test.features.json:md5,67c630685f9c819ef28574144c284b4e" ] @@ -107,8 +102,7 @@ "combined_json": [ [ { - "id": "test", - "analysis": "features" + "id": "test" }, "test.features.json:md5,67c630685f9c819ef28574144c284b4e" ] @@ -134,8 +128,7 @@ "0": [ [ { - "id": "test", - "analysis": "features" + "id": "test" }, "test.features.json:md5,aefc84472e26178b64d01051be6d58b2" ] @@ -150,8 +143,7 @@ "combined_json": [ [ { - "id": "test", - "analysis": "features" + "id": "test" }, "test.features.json:md5,aefc84472e26178b64d01051be6d58b2" ] From b50cfba5019afab709012af4f4db8673971f483c Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Fri, 15 May 2026 15:31:55 +0100 Subject: [PATCH 33/36] Use package versions --- modules/ensembl/fasta/split/main.nf | 2 +- modules/ensembl/features/combine_json/main.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/ensembl/fasta/split/main.nf b/modules/ensembl/fasta/split/main.nf index 9b62fc0..da643c7 100644 --- a/modules/ensembl/fasta/split/main.nf +++ b/modules/ensembl/fasta/split/main.nf @@ -28,7 +28,7 @@ process FASTA_SPLIT { output: tuple val(meta), path("splits/**/*.fa"), emit: fastas tuple val(meta), path("splits/*.agp"), emit: agp, optional: true - tuple val("${task.process}"), val('fasta_split'), eval("python -c 'from importlib.metadata import distributions; print(next((dist.version for dist in distributions() if dist.metadata[\"Name\"].lower().replace(\"_\", \"-\") == \"ensembl-genomio\"), \"unknown\"))'"), emit: versions_fasta_split, topic: versions + tuple val("${task.process}"), val('fasta_split'), eval("fasta_split --version"), emit: versions_fasta_split, topic: versions when: task.ext.when == null || task.ext.when diff --git 
a/modules/ensembl/features/combine_json/main.nf b/modules/ensembl/features/combine_json/main.nf index ebe2f7e..e366038 100644 --- a/modules/ensembl/features/combine_json/main.nf +++ b/modules/ensembl/features/combine_json/main.nf @@ -27,7 +27,7 @@ process FEATURES_COMBINE_JSON { output: tuple val(meta), path("${meta.id}.${analysis}.json"), emit: combined_json - tuple val("${task.process}"), val('features_combine_json'), eval("python -c 'from importlib.metadata import distributions; print(next((dist.version for dist in distributions() if dist.metadata[\"Name\"].lower().replace(\"_\", \"-\") == \"ensembl-genomio\"), \"unknown\"))'"), emit: versions_features_combine_json, topic: versions + tuple val("${task.process}"), val('features_combine_json'), eval("features_combine_json --version"), emit: versions_features_combine_json, topic: versions when: task.ext.when == null || task.ext.when From adb281acd7c44be67f8753149c7e20e90a8acc41 Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Fri, 15 May 2026 15:39:37 +0100 Subject: [PATCH 34/36] Update meta.yml --- modules/ensembl/fasta/split/meta.yml | 10 ++-------- modules/ensembl/features/combine_json/meta.yml | 10 ++-------- 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/modules/ensembl/fasta/split/meta.yml b/modules/ensembl/fasta/split/meta.yml index 6617745..a00fcba 100644 --- a/modules/ensembl/fasta/split/meta.yml +++ b/modules/ensembl/fasta/split/meta.yml @@ -59,10 +59,7 @@ output: - fasta_split: type: string description: The name of the tool. - - ? python -c 'from importlib.metadata import distributions; - print(next((dist.version for dist in distributions() if - dist.metadata["Name"].lower().replace("_", "-") == "ensembl-genomio"), - "unknown"))' + - ? fasta_split --version : type: eval description: The expression to obtain the version of the tool topics: @@ -73,10 +70,7 @@ topics: - fasta_split: type: string description: The name of the tool. - - ? 
python -c 'from importlib.metadata import distributions; - print(next((dist.version for dist in distributions() if - dist.metadata["Name"].lower().replace("_", "-") == "ensembl-genomio"), - "unknown"))' + - ? fasta_split --version : type: eval description: The expression to obtain the version of the tool authors: diff --git a/modules/ensembl/features/combine_json/meta.yml b/modules/ensembl/features/combine_json/meta.yml index 4558ebe..049fa1a 100644 --- a/modules/ensembl/features/combine_json/meta.yml +++ b/modules/ensembl/features/combine_json/meta.yml @@ -55,10 +55,7 @@ output: - features_combine_json: type: string description: The name of the tool. - - ? python -c 'from importlib.metadata import distributions; - print(next((dist.version for dist in distributions() if - dist.metadata["Name"].lower().replace("_", "-") == "ensembl-genomio"), - "unknown"))' + - ? features_combine_json --version : type: eval description: The expression to obtain the version of the tool topics: @@ -69,10 +66,7 @@ topics: - features_combine_json: type: string description: The name of the tool. - - ? python -c 'from importlib.metadata import distributions; - print(next((dist.version for dist in distributions() if - dist.metadata["Name"].lower().replace("_", "-") == "ensembl-genomio"), - "unknown"))' + - ? 
features_combine_json --version : type: eval description: The expression to obtain the version of the tool authors: From a14b6c572c368a9187aa2ca27b61de1b5080746b Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Fri, 15 May 2026 16:24:34 +0100 Subject: [PATCH 35/36] Bump genomio version in snapshots --- .../fasta/recombine/tests/main.nf.test.snap | 8 ++++---- .../ensembl/fasta/split/tests/main.nf.test.snap | 16 ++++++++-------- .../combine_json/tests/main.nf.test.snap | 16 ++++++++-------- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/modules/ensembl/fasta/recombine/tests/main.nf.test.snap b/modules/ensembl/fasta/recombine/tests/main.nf.test.snap index 3f33a6d..0357999 100644 --- a/modules/ensembl/fasta/recombine/tests/main.nf.test.snap +++ b/modules/ensembl/fasta/recombine/tests/main.nf.test.snap @@ -14,7 +14,7 @@ [ "FASTA_RECOMBINE", "fasta_recombine", - "1.6.2" + "1.6.3" ] ], "recombined_fasta": [ @@ -29,7 +29,7 @@ [ "FASTA_RECOMBINE", "fasta_recombine", - "1.6.2" + "1.6.3" ] ] } @@ -55,7 +55,7 @@ [ "FASTA_RECOMBINE", "fasta_recombine", - "1.6.2" + "1.6.3" ] ], "recombined_fasta": [ @@ -70,7 +70,7 @@ [ "FASTA_RECOMBINE", "fasta_recombine", - "1.6.2" + "1.6.3" ] ] } diff --git a/modules/ensembl/fasta/split/tests/main.nf.test.snap b/modules/ensembl/fasta/split/tests/main.nf.test.snap index ebe20c3..07ec1d6 100644 --- a/modules/ensembl/fasta/split/tests/main.nf.test.snap +++ b/modules/ensembl/fasta/split/tests/main.nf.test.snap @@ -20,7 +20,7 @@ [ "FASTA_SPLIT", "fasta_split", - "1.6.2" + "1.6.3" ] ], "agp": [ @@ -41,7 +41,7 @@ [ "FASTA_SPLIT", "fasta_split", - "1.6.2" + "1.6.3" ] ] } @@ -78,7 +78,7 @@ [ "FASTA_SPLIT", "fasta_split", - "1.6.2" + "1.6.3" ] ], "agp": [ @@ -104,7 +104,7 @@ [ "FASTA_SPLIT", "fasta_split", - "1.6.2" + "1.6.3" ] ] } @@ -136,7 +136,7 @@ [ "FASTA_SPLIT", "fasta_split", - "1.6.2" + "1.6.3" ] ], "agp": [ @@ -157,7 +157,7 @@ [ "FASTA_SPLIT", "fasta_split", - "1.6.2" + "1.6.3" ] ] } @@ -189,7 +189,7 @@ [ 
"FASTA_SPLIT", "fasta_split", - "1.6.2" + "1.6.3" ] ], "agp": [ @@ -210,7 +210,7 @@ [ "FASTA_SPLIT", "fasta_split", - "1.6.2" + "1.6.3" ] ] } diff --git a/modules/ensembl/features/combine_json/tests/main.nf.test.snap b/modules/ensembl/features/combine_json/tests/main.nf.test.snap index 1927537..397c8f0 100644 --- a/modules/ensembl/features/combine_json/tests/main.nf.test.snap +++ b/modules/ensembl/features/combine_json/tests/main.nf.test.snap @@ -14,7 +14,7 @@ [ "FEATURES_COMBINE_JSON", "features_combine_json", - "1.6.2" + "1.6.3" ] ], "combined_json": [ @@ -29,7 +29,7 @@ [ "FEATURES_COMBINE_JSON", "features_combine_json", - "1.6.2" + "1.6.3" ] ] } @@ -55,7 +55,7 @@ [ "FEATURES_COMBINE_JSON", "features_combine_json", - "1.6.2" + "1.6.3" ] ], "combined_json": [ @@ -70,7 +70,7 @@ [ "FEATURES_COMBINE_JSON", "features_combine_json", - "1.6.2" + "1.6.3" ] ] } @@ -96,7 +96,7 @@ [ "FEATURES_COMBINE_JSON", "features_combine_json", - "1.6.2" + "1.6.3" ] ], "combined_json": [ @@ -111,7 +111,7 @@ [ "FEATURES_COMBINE_JSON", "features_combine_json", - "1.6.2" + "1.6.3" ] ] } @@ -137,7 +137,7 @@ [ "FEATURES_COMBINE_JSON", "features_combine_json", - "1.6.2" + "1.6.3" ] ], "combined_json": [ @@ -152,7 +152,7 @@ [ "FEATURES_COMBINE_JSON", "features_combine_json", - "1.6.2" + "1.6.3" ] ] } From ef8a3ea388d054494cb52888f8496abe9e8d8434 Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Tue, 19 May 2026 15:36:53 +0100 Subject: [PATCH 36/36] Update versioning for fasta_recombine --- modules/ensembl/fasta/recombine/main.nf | 2 +- modules/ensembl/fasta/recombine/meta.yml | 10 ++-------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/modules/ensembl/fasta/recombine/main.nf b/modules/ensembl/fasta/recombine/main.nf index 0c8afa9..d14d71b 100644 --- a/modules/ensembl/fasta/recombine/main.nf +++ b/modules/ensembl/fasta/recombine/main.nf @@ -28,7 +28,7 @@ process FASTA_RECOMBINE { output: tuple val(meta), path("${meta.id}.fa"), emit: recombined_fasta - tuple 
val("${task.process}"), val('fasta_recombine'), eval("python -c 'from importlib.metadata import distributions; print(next((dist.version for dist in distributions() if dist.metadata[\"Name\"].lower().replace(\"_\", \"-\") == \"ensembl-genomio\"), \"unknown\"))'"), emit: versions_fasta_recombine, topic: versions + tuple val("${task.process}"), val('fasta_recombine'), eval("fasta_recombine --version"), emit: versions_fasta_recombine, topic: versions when: task.ext.when == null || task.ext.when diff --git a/modules/ensembl/fasta/recombine/meta.yml b/modules/ensembl/fasta/recombine/meta.yml index 2836455..7c7aec4 100644 --- a/modules/ensembl/fasta/recombine/meta.yml +++ b/modules/ensembl/fasta/recombine/meta.yml @@ -51,10 +51,7 @@ output: - fasta_recombine: type: string description: The name of the tool. - - ? python -c 'from importlib.metadata import distributions; - print(next((dist.version for dist in distributions() if - dist.metadata["Name"].lower().replace("_", "-") == "ensembl-genomio"), - "unknown"))' + - ? fasta_recombine --version : type: eval description: The expression to obtain the version of the tool topics: @@ -65,10 +62,7 @@ topics: - fasta_recombine: type: string description: The name of the tool. - - ? python -c 'from importlib.metadata import distributions; - print(next((dist.version for dist in distributions() if - dist.metadata["Name"].lower().replace("_", "-") == "ensembl-genomio"), - "unknown"))' + - ? fasta_recombine --version : type: eval description: The expression to obtain the version of the tool authors: