From 2977c01bceb68e6f59d777ab5c51578fa12fd065 Mon Sep 17 00:00:00 2001 From: hannahbaumann Date: Fri, 20 Feb 2026 11:08:42 +0100 Subject: [PATCH 01/14] Refactor RMSD analyses into MDAnalysis AnaysisBase classes --- src/openfe_analysis/rmsd.py | 234 ++++++++++++++++++++++-------------- 1 file changed, 144 insertions(+), 90 deletions(-) diff --git a/src/openfe_analysis/rmsd.py b/src/openfe_analysis/rmsd.py index b566ba1..808339c 100644 --- a/src/openfe_analysis/rmsd.py +++ b/src/openfe_analysis/rmsd.py @@ -5,11 +5,9 @@ import MDAnalysis as mda import netCDF4 as nc import numpy as np -import tqdm from MDAnalysis.analysis import rms -from MDAnalysis.lib.mdamath import make_whole +from MDAnalysis.analysis.base import AnalysisBase from MDAnalysis.transformations import unwrap -from numpy import typing as npt from .reader import FEReader from .transformations import Aligner, ClosestImageShift, NoJump @@ -99,6 +97,139 @@ def make_Universe(top: pathlib.Path, trj: nc.Dataset, state: int) -> mda.Univers return u +class ProteinRMSD(AnalysisBase): + """Calculate the 1D protein RMSD""" + + def __init__(self, atomgroup, **kwargs): + super(ProteinRMSD, self).__init__(atomgroup.universe.trajectory, **kwargs) + + self._ag = atomgroup + + def _prepare(self): + """ + Set the first frame as the reference + """ + self.results.rmsd = [] + self._reference = self._ag.positions + + def _single_frame(self): + rmsd = rms.rmsd( + self._ag.positions, + self._reference, + center=False, + superposition=False, + ) + self.results.rmsd.append(rmsd) + + def _conclude(self): + self.results.rmsd = np.asarray(self.results.rmsd) + + +class Protein2DRMSD(AnalysisBase): + """ + Flattened 2D RMSD matrix + + For all unique frame pairs ``(i, j)`` with ``i < j``, this function + computes the RMSD between atomic coordinates after optimal alignment. + """ + + def __init__(self, atomgroup, weights=None, **kwargs): + """ + Parameters + ---------- + atomgroup: AtomGroup + Protein atoms (e.g. CA selection) + weights: np.ndarray, optional + Per-atom weights to use in the RMSD calculation. If ``None``, + all atoms are weighted equally. + """ + super(Protein2DRMSD, self).__init__(atomgroup.universe.trajectory, **kwargs) + + self._weights = weights + self._ag = atomgroup + + def _prepare(self): + self._coords = [] + self.results.rmsd2d = [] + + def _single_frame(self): + self._coords.append(self._ag.positions) + + def _conclude(self): + positions = np.asarray(self._coords) + nframes, _, _ = positions.shape + + output = [] + for i, j in itertools.combinations(range(nframes), 2): + posi, posj = positions[i], positions[j] + rmsd = rms.rmsd( + posi, + posj, + self._weights, + center=True, + superposition=True, + ) + output.append(rmsd) + + self.results.rmsd2d = np.asarray(output) + + +class LigandRMSD(AnalysisBase): + """ + 1D RMSD time series for a ligand AtomGroup. + """ + + def __init__(self, atomgroup, **kwargs): + super(LigandRMSD, self).__init__(atomgroup.universe.trajectory, **kwargs) + + self._ag = atomgroup + + def _prepare(self): + self.results.rmsd = [] + self._reference = self._ag.positions + self._weights = self._ag.masses / np.mean(self._ag.masses) + + def _single_frame(self): + rmsd = rms.rmsd( + self._ag.positions, + self._reference, + self._weights, + center=False, + superposition=False, + ) + self.results.rmsd.append(rmsd) + + def _conclude(self): + self.results.rmsd = np.asarray(self.results.rmsd) + + +class LigandCOMDrift(AnalysisBase): + """ + Ligand center-of-mass displacement from initial position. + """ + + def __init__(self, atomgroup, **kwargs): + super(LigandCOMDrift, self).__init__(atomgroup.universe.trajectory, **kwargs) + + self._ag = atomgroup + + def _prepare(self): + self.results.com_drift = [] + self._initial_com = self._ag.center_of_mass() + + def _single_frame(self): + # distance between start and current ligand position + # ignores PBC, but we've already centered the traj + drift = mda.lib.distances.calc_bonds( + self._ag.center_of_mass(), + self._initial_com, + ) + self.results.com_drift.append(drift) + + def _conclude(self): + self.results.com_drift = np.asarray(self.results.com_drift) + + def gather_rms_data( pdb_topology: pathlib.Path, dataset: pathlib.Path, skip: Optional[int] = None ) -> dict[str, list[float]]: @@ -159,8 +290,6 @@ def gather_rms_data( # max against 1 to avoid skip=0 case skip = max(n_frames // 500, 1) - pb = tqdm.tqdm(total=int(n_frames / skip) * n_lambda) - u_top = mda.Universe(pdb_topology) for i in range(n_lambda): @@ -171,93 +300,18 @@ def gather_rms_data( prot = u.select_atoms("protein and name CA") ligand = u.select_atoms("resname UNK") - # save coordinates for 2D RMSD matrix - # TODO: Some smart guard to avoid allocating a silly amount of memory? - prot2d = np.empty((len(u.trajectory[::skip]), len(prot), 3), dtype=np.float32) - - prot_start = prot.positions - ligand_start = ligand.positions - ligand_initial_com = ligand.center_of_mass() - ligand_weights = ligand.masses / np.mean(ligand.masses) - - this_protein_rmsd = [] - this_ligand_rmsd = [] - this_ligand_wander = [] - - for ts_i, ts in enumerate(u.trajectory[::skip]): - pb.update() - - if prot: - prot2d[ts_i, :, :] = prot.positions - this_protein_rmsd.append( - rms.rmsd( - prot.positions, - prot_start, - None, # prot_weights, - center=False, - superposition=False, - ) - ) - if ligand: - this_ligand_rmsd.append( - rms.rmsd( - ligand.positions, - ligand_start, - ligand_weights, - center=False, - superposition=False, - ) - ) - this_ligand_wander.append( - # distance between start and current ligand position - # ignores PBC, but we've already centered the traj - mda.lib.distances.calc_bonds(ligand.center_of_mass(), ligand_initial_com) - ) - if prot: - # can ignore weights here as it's all Ca - rmsd2d = twoD_RMSD(prot2d, w=None) # prot_weights) - output["protein_RMSD"].append(this_protein_rmsd) - output["protein_2D_RMSD"].append(rmsd2d) - if ligand: - output["ligand_RMSD"].append(this_ligand_rmsd) - output["ligand_wander"].append(this_ligand_wander) - - output["time(ps)"] = list(np.arange(len(u.trajectory))[::skip] * u.trajectory.dt) - - return output - - -def twoD_RMSD(positions, w: Optional[npt.NDArray]) -> list[float]: - """ - Compute a flattened 2D RMSD matrix from a trajectory. - - For all unique frame pairs ``(i, j)`` with ``i < j``, this function - computes the RMSD between atomic coordinates after optimal alignment. - - Parameters - ---------- - positions : np.ndarray - Atomic coordinates for all frames in the trajectory. - w : np.ndarray, optional - Per-atom weights to use in the RMSD calculation. If ``None``, - all atoms are weighted equally. + prot_rmsd = ProteinRMSD(prot).run(step=skip) + output["protein_RMSD"].append(prot_rmsd.results.rmsd) + prot_rmsd2d = Protein2DRMSD(prot).run(step=skip) + output["protein_2D_RMSD"].append(prot_rmsd2d.results.rmsd2d) - Returns - ------- - list of float - Flattened list of RMSD values corresponding to all frame pairs - ``(i, j)`` with ``i < j``. - """ - nframes, _, _ = positions.shape - - output = [] - - for i, j in itertools.combinations(range(nframes), 2): - posi, posj = positions[i], positions[j] - - rmsd = rms.rmsd(posi, posj, w, center=True, superposition=True) + if ligand: + lig_rmsd = LigandRMSD(ligand).run(step=skip) + output["ligand_RMSD"].append(lig_rmsd.results.rmsd) + lig_com_drift = LigandCOMDrift(ligand).run(step=skip) + output["ligand_wander"].append(lig_com_drift.results.com_drift) - output.append(rmsd) + output["time(ps)"] = np.arange(len(u.trajectory))[::skip] * u.trajectory.dt return output From 4a960a02e3b92beb8d7a448ca8ef87fd8dd115bd Mon Sep 17 00:00:00 2001 From: hannahbaumann Date: Mon, 23 Feb 2026 09:52:33 +0100 Subject: [PATCH 02/14] Combine RMSD analyses --- src/openfe_analysis/rmsd.py | 59 +++++++++++++++---------------------- 1 file changed, 24 insertions(+), 35 deletions(-) diff --git a/src/openfe_analysis/rmsd.py b/src/openfe_analysis/rmsd.py index 808339c..5591da4 100644 --- a/src/openfe_analysis/rmsd.py +++ b/src/openfe_analysis/rmsd.py @@ -97,34 +97,6 @@ def make_Universe(top: pathlib.Path, trj: nc.Dataset, state: int) -> mda.Univers return u -class ProteinRMSD(AnalysisBase): - """Calculate the 1D protein RMSD""" - - def __init__(self, atomgroup, **kwargs): - super(ProteinRMSD, self).__init__(atomgroup.universe.trajectory, **kwargs) - - self._ag = atomgroup - - def _prepare(self): - """ - Set the first frame as the reference - """ - self.results.rmsd = [] - self._reference = self._ag.positions - - def _single_frame(self): - rmsd = rms.rmsd( - self._ag.positions, - self._reference, - center=False, - superposition=False, - ) - self.results.rmsd.append(rmsd) - - def _conclude(self): - self.results.rmsd = np.asarray(self.results.rmsd) - - class Protein2DRMSD(AnalysisBase): """ Flattened 2D RMSD matrix @@ -174,20 +146,32 @@ def _conclude(self): self.results.rmsd2d = np.asarray(output) -class LigandRMSD(AnalysisBase): +class RMSDAnalysis(AnalysisBase): """ - 1D RMSD time series for a ligand AtomGroup. + 1D RMSD time series for an AtomGroup. + + Parameters + ---------- + atomgroup : MDAnalysis.AtomGroup + Atoms to compute RMSD for. + mass_weighted : bool, optional + If True, compute mass-weighted RMSD. """ - def __init__(self, atomgroup, **kwargs): - super(LigandRMSD, self).__init__(atomgroup.universe.trajectory, **kwargs) + def __init__(self, atomgroup, mass_weighted=False, **kwargs): + super(RMSDAnalysis, self).__init__(atomgroup.universe.trajectory, **kwargs) self._ag = atomgroup + self._mass_weighted = mass_weighted def _prepare(self): self.results.rmsd = [] self._reference = self._ag.positions - self._weights = self._ag.masses / np.mean(self._ag.masses) + + if self._mass_weighted: + self._weights = self._ag.masses / np.mean(self._ag.masses) + else: + self._weights = None def _single_frame(self): rmsd = rms.rmsd( @@ -301,14 +285,19 @@ def gather_rms_data( ligand = u.select_atoms("resname UNK") if prot: - prot_rmsd = ProteinRMSD(prot).run(step=skip) + prot_rmsd = RMSDAnalysis(prot).run(step=skip) output["protein_RMSD"].append(prot_rmsd.results.rmsd) + # prot_rmsd = rms.RMSD(prot).run(step=skip) + # output["protein_RMSD"].append(prot_rmsd.results.rmsd.T[2]) prot_rmsd2d = Protein2DRMSD(prot).run(step=skip) output["protein_2D_RMSD"].append(prot_rmsd2d.results.rmsd2d) if ligand: - lig_rmsd = LigandRMSD(ligand).run(step=skip) + lig_rmsd = RMSDAnalysis(ligand, mass_weighted=True).run(step=skip) output["ligand_RMSD"].append(lig_rmsd.results.rmsd) + # weight = ligand.masses / np.mean(ligand.masses) + # lig_rmsd = rms.RMSD(ligand, weights=weight).run(step=skip) + # output["ligand_RMSD"].append(lig_rmsd.results.rmsd.T[2]) lig_com_drift = LigandCOMDrift(ligand).run(step=skip) output["ligand_wander"].append(lig_com_drift.results.com_drift) From df017da2a32384213bf6ff0974622d74e43b28f3 Mon Sep 17 00:00:00 2001 From: hannahbaumann Date: Mon, 2 Mar 2026 15:05:53 +0100 Subject: [PATCH 03/14] Add reference and superposition option --- src/openfe_analysis/rmsd.py | 41 +++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/src/openfe_analysis/rmsd.py b/src/openfe_analysis/rmsd.py index 5591da4..6576100 100644 --- a/src/openfe_analysis/rmsd.py +++ b/src/openfe_analysis/rmsd.py @@ -154,19 +154,26 @@ class RMSDAnalysis(AnalysisBase): ---------- atomgroup : MDAnalysis.AtomGroup Atoms to compute RMSD for. + reference: Optional[MDAnalysis.AtomGroup] + Reference AtomGroup. If None, the first mass_weighted : bool, optional If True, compute mass-weighted RMSD. """ - def __init__(self, atomgroup, mass_weighted=False, **kwargs): + def __init__( + self, atomgroup, reference=None, mass_weighted=False, superposition=False, **kwargs + ): super(RMSDAnalysis, self).__init__(atomgroup.universe.trajectory, **kwargs) self._ag = atomgroup + self._reference = reference if reference is not None else self._ag self._mass_weighted = mass_weighted + self._superposition = superposition def _prepare(self): self.results.rmsd = [] - self._reference = self._ag.positions + + self._reference_pos = self._reference.positions if self._mass_weighted: self._weights = self._ag.masses / np.mean(self._ag.masses) @@ -176,10 +183,10 @@ def _prepare(self): def _single_frame(self): rmsd = rms.rmsd( self._ag.positions, - self._reference, + self._reference_pos, self._weights, center=False, - superposition=False, + superposition=self._superposition, ) self.results.rmsd.append(rmsd) @@ -287,17 +294,33 @@ def gather_rms_data( if prot: prot_rmsd = RMSDAnalysis(prot).run(step=skip) output["protein_RMSD"].append(prot_rmsd.results.rmsd) - # prot_rmsd = rms.RMSD(prot).run(step=skip) - # output["protein_RMSD"].append(prot_rmsd.results.rmsd.T[2]) + # # Using the MDAnalysis RMSD class instead + # gs = ["protein and name CA", "protein"] + # prot_rmsd = rms.RMSD( + # u, select="protein and name CA", groupselections=gs, weights="mass") + # prot_rmsd.run(step=skip) + # # The results contain: + # # - frame number + # # - time + # # - RMSD based on select (after superimposing) + # # - RMSD based on groupselections, one array per selection + # output["protein_RMSD"].append(prot_rmsd.results.rmsd.T[3]) prot_rmsd2d = Protein2DRMSD(prot).run(step=skip) output["protein_2D_RMSD"].append(prot_rmsd2d.results.rmsd2d) if ligand: lig_rmsd = RMSDAnalysis(ligand, mass_weighted=True).run(step=skip) output["ligand_RMSD"].append(lig_rmsd.results.rmsd) - # weight = ligand.masses / np.mean(ligand.masses) - # lig_rmsd = rms.RMSD(ligand, weights=weight).run(step=skip) - # output["ligand_RMSD"].append(lig_rmsd.results.rmsd.T[2]) + # # Using the MDAnalysis RMSD class instead + # groupselections = ["resname UNK"] + # lig_rmsd = rms.RMSD( + # u, + # select="protein and name CA", + # groupselections=groupselections, + # weights="mass", + # ) + # lig_rmsd.run(step=skip) + # output["ligand_RMSD"].append(lig_rmsd.results.rmsd.T[3]) lig_com_drift = LigandCOMDrift(ligand).run(step=skip) output["ligand_wander"].append(lig_com_drift.results.com_drift) From d824ab6e65a92a3722dd738958742a1643d08399 Mon Sep 17 00:00:00 2001 From: Hannah Baumann <43765638+hannahbaumann@users.noreply.github.com> Date: Mon, 2 Mar 2026 15:18:04 +0100 Subject: [PATCH 04/14] Apply suggestion from @hannahbaumann --- src/openfe_analysis/rmsd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openfe_analysis/rmsd.py b/src/openfe_analysis/rmsd.py index 6576100..f653fcd 100644 --- a/src/openfe_analysis/rmsd.py +++ b/src/openfe_analysis/rmsd.py @@ -155,7 +155,7 @@ class RMSDAnalysis(AnalysisBase): atomgroup : MDAnalysis.AtomGroup Atoms to compute RMSD for. reference: Optional[MDAnalysis.AtomGroup] - Reference AtomGroup. If None, the first + Reference AtomGroup. If None, the first frame of the trajectory will be used. mass_weighted : bool, optional If True, compute mass-weighted RMSD. """ From e16b2e34cf2fe03758e7035aab1555045cdcccef Mon Sep 17 00:00:00 2001 From: hannahbaumann Date: Mon, 13 Apr 2026 10:59:10 +0200 Subject: [PATCH 05/14] Some changes --- src/openfe_analysis/rmsd.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/openfe_analysis/rmsd.py b/src/openfe_analysis/rmsd.py index a0305f4..1c84e35 100644 --- a/src/openfe_analysis/rmsd.py +++ b/src/openfe_analysis/rmsd.py @@ -5,7 +5,7 @@ import MDAnalysis as mda import netCDF4 as nc import numpy as np -from MDAnalysis.analysis import rms +from MDAnalysis.analysis import diffusionmap, rms from MDAnalysis.analysis.base import AnalysisBase from MDAnalysis.transformations import unwrap @@ -310,6 +310,9 @@ def gather_rms_data( # output["protein_RMSD"].append(prot_rmsd.results.rmsd.T[3]) prot_rmsd2d = Protein2DRMSD(prot).run(step=skip) output["protein_2D_RMSD"].append(prot_rmsd2d.results.rmsd2d) + prot_rmsd2d = diffusionmap.DistanceMatrix(u, select="protein") + prot_rmsd2d.run(step=skip) + output["protein_2D_RMSD"].append(prot_rmsd2d.results.dist_matrix) if ligand: lig_rmsd = RMSDAnalysis(ligand, mass_weighted=True).run(step=skip) From 616bf42b0f422258c36bf614dbaeb8b7f8a263d0 Mon Sep 17 00:00:00 2001 From: hannahbaumann Date: Mon, 13 Apr 2026 11:41:49 +0200 Subject: [PATCH 06/14] Fix mda 2D analysis example --- src/openfe_analysis/rmsd.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/openfe_analysis/rmsd.py b/src/openfe_analysis/rmsd.py index 1c84e35..1e068fc 100644 --- a/src/openfe_analysis/rmsd.py +++ b/src/openfe_analysis/rmsd.py @@ -298,7 +298,7 @@ def gather_rms_data( prot_rmsd = RMSDAnalysis(prot).run(step=skip) output["protein_RMSD"].append(prot_rmsd.results.rmsd) # # Using the MDAnalysis RMSD class instead - # gs = ["protein and name CA", "protein"] + # gs = ["protein and name CA"] # prot_rmsd = rms.RMSD( # u, select="protein and name CA", groupselections=gs, weights="mass") # prot_rmsd.run(step=skip) @@ -308,11 +308,16 @@ def gather_rms_data( # # - RMSD based on select (after superimposing) # # - RMSD based on groupselections, one array per selection # output["protein_RMSD"].append(prot_rmsd.results.rmsd.T[3]) + prot_rmsd2d = Protein2DRMSD(prot).run(step=skip) output["protein_2D_RMSD"].append(prot_rmsd2d.results.rmsd2d) - prot_rmsd2d = diffusionmap.DistanceMatrix(u, select="protein") - prot_rmsd2d.run(step=skip) - output["protein_2D_RMSD"].append(prot_rmsd2d.results.dist_matrix) + # # Using the MDAnalysis DistanceMatrix class + # prot_rmsd2d = diffusionmap.DistanceMatrix(u, select="protein and name CA") + # prot_rmsd2d.run(step=skip) + # dist_mat = prot_rmsd2d.results.dist_matrix + # i, j = np.triu_indices_from(dist_mat, k=1) + # flattened = dist_mat[i, j] + # output["protein_2D_RMSD"].append(flattened) if ligand: lig_rmsd = RMSDAnalysis(ligand, mass_weighted=True).run(step=skip) From 90ed39ef7d000405e09a7f83decd96b92430f7fc Mon Sep 17 00:00:00 2001 From: hannahbaumann Date: Mon, 13 Apr 2026 13:40:32 +0200 Subject: [PATCH 07/14] Add RMSD test using mda test data --- environment.yml | 1 + .../tests/test_rmsd_mda_data.py | 80 +++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 src/openfe_analysis/tests/test_rmsd_mda_data.py diff --git a/environment.yml b/environment.yml index e48a81f..f65c4df 100644 --- a/environment.yml +++ b/environment.yml @@ -4,6 +4,7 @@ channels: dependencies: - click - MDAnalysis + - MDAnalysisTests - netCDF4 - openff-units - pip diff --git a/src/openfe_analysis/tests/test_rmsd_mda_data.py b/src/openfe_analysis/tests/test_rmsd_mda_data.py new file mode 100644 index 0000000..83f95ef --- /dev/null +++ b/src/openfe_analysis/tests/test_rmsd_mda_data.py @@ -0,0 +1,80 @@ +import MDAnalysis as mda +import pytest +from MDAnalysisTests.datafiles import DCD, PSF +from numpy.testing import assert_allclose, assert_almost_equal + +from openfe_analysis.rmsd import RMSDAnalysis + + +@pytest.fixture +def mda_universe(): + return mda.Universe(PSF, DCD) + + +@pytest.fixture() +def correct_values(): + return [0, 4.68953] + + +@pytest.fixture() +def correct_values_mass(): + return [0, 4.74920] + + +def test_rmsd(mda_universe, correct_values): + prot = mda_universe.select_atoms("name CA") + prot_rmsd = RMSDAnalysis(prot, superposition=True).run(step=49) + assert_almost_equal( + prot_rmsd.results.rmsd, + correct_values, + 4, + err_msg="error: rmsd profile should match" + "test values", + ) + + +def test_rmsd_frames(mda_universe, correct_values): + prot = mda_universe.select_atoms("name CA") + prot_rmsd = RMSDAnalysis(prot, superposition=True).run(frames=[0, 49]) + assert_almost_equal( + prot_rmsd.results.rmsd, + correct_values, + 4, + err_msg="error: rmsd profile should match" + "test values", + ) + + +def test_rmsd_single_frame(mda_universe): + prot = mda_universe.select_atoms("name CA") + prot_rmsd = RMSDAnalysis(prot, superposition=True).run(start=5, stop=6) + single_frame = [0.91544906] + assert_almost_equal( + prot_rmsd.results.rmsd, + single_frame, + 4, + err_msg="error: rmsd profile should match" + "test values", + ) + + +def test_mass_weighted(mda_universe, correct_values): + # mass weighting the CA should give the same answer as weighing + # equally because all CA have the same mass + prot = mda_universe.select_atoms("name CA") + prot_rmsd = RMSDAnalysis(prot, superposition=True, mass_weighted=True).run(step=49) + + assert_almost_equal( + prot_rmsd.results.rmsd, + correct_values, + 4, + err_msg="error: rmsd profile should matchtest values", + ) + + +def test_custom_weighted(mda_universe, correct_values_mass): + prot = mda_universe.select_atoms("all") + prot_rmsd = RMSDAnalysis(prot, superposition=True, mass_weighted=True).run(step=49) + assert_almost_equal( + prot_rmsd.results.rmsd, + correct_values_mass, + 4, + err_msg="error: rmsd profile should matchtest values", + ) From 7d6e49fde33281964f76289e283088c40a0a52d6 Mon Sep 17 00:00:00 2001 From: hannahbaumann Date: Tue, 19 May 2026 09:51:30 +0200 Subject: [PATCH 08/14] Some fixes --- src/openfe_analysis/rmsd.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/openfe_analysis/rmsd.py b/src/openfe_analysis/rmsd.py index 1e068fc..4b5fb2e 100644 --- a/src/openfe_analysis/rmsd.py +++ b/src/openfe_analysis/rmsd.py @@ -127,7 +127,7 @@ def _prepare(self): self.results.rmsd2d = [] def _single_frame(self): - self._coords.append(self._ag.positions) + self._coords.append(self._ag.positions.copy()) def _conclude(self): positions = np.asarray(self._coords) @@ -175,7 +175,7 @@ def __init__( def _prepare(self): self.results.rmsd = [] - self._reference_pos = self._reference.positions + self._reference_pos = self._reference.positions.copy() if self._mass_weighted: self._weights = self._ag.masses / np.mean(self._ag.masses) @@ -335,6 +335,6 @@ def gather_rms_data( lig_com_drift = LigandCOMDrift(ligand).run(step=skip) output["ligand_wander"].append(lig_com_drift.results.com_drift) - output["time(ps)"] = np.arange(len(u.trajectory))[::skip] * u.trajectory.dt + output["time(ps)"] = np.arange(len(u.trajectory))[::skip] * u.trajectory.dt return output From 4bf598f3901e0feccb1d1d4982cc14ede26e05cc Mon Sep 17 00:00:00 2001 From: hannahbaumann Date: Tue, 19 May 2026 10:26:24 +0200 Subject: [PATCH 09/14] Some more updates --- src/openfe_analysis/rmsd.py | 41 ++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/src/openfe_analysis/rmsd.py b/src/openfe_analysis/rmsd.py index 4b5fb2e..23c2680 100644 --- a/src/openfe_analysis/rmsd.py +++ b/src/openfe_analysis/rmsd.py @@ -117,8 +117,7 @@ def __init__(self, atomgroup, weights=None, **kwargs): Per-atom weights to use in the RMSD calculation. If ``None``, all atoms are weighted equally. """ - super(Protein2DRMSD, self).__init__(atomgroup.universe.trajectory, **kwargs) - + super().__init__(atomgroup.universe.trajectory, **kwargs) self._weights = weights self._ag = atomgroup @@ -131,19 +130,19 @@ def _single_frame(self): def _conclude(self): positions = np.asarray(self._coords) - nframes, _, _ = positions.shape + nframes = positions.shape[0] output = [] for i, j in itertools.combinations(range(nframes), 2): posi, posj = positions[i], positions[j] - rmsd = rms.rmsd( + pair_rmsd = rms.rmsd( posi, posj, self._weights, center=True, superposition=True, ) - output.append(rmsd) + output.append(pair_rmsd) self.results.rmsd2d = np.asarray(output) @@ -157,15 +156,24 @@ class RMSDAnalysis(AnalysisBase): atomgroup : MDAnalysis.AtomGroup Atoms to compute RMSD for. reference: Optional[MDAnalysis.AtomGroup] - Reference AtomGroup. If None, the first frame of the trajectory will be used. + Reference AtomGroup. If ``None``, the reference positions are captured + from the mobile AtomGroup at the start of the run (i.e. whatever frame + the trajectory is on when ``.run()`` is called). mass_weighted : bool, optional If True, compute mass-weighted RMSD. + superposition : bool, optional + If ``True``, perform rotational superposition before computing RMSD. """ def __init__( - self, atomgroup, reference=None, mass_weighted=False, superposition=False, **kwargs + self, + atomgroup, + reference=None, + mass_weighted=False, + superposition=False, + **kwargs, ): - super(RMSDAnalysis, self).__init__(atomgroup.universe.trajectory, **kwargs) + super().__init__(atomgroup.universe.trajectory, **kwargs) self._ag = atomgroup self._reference = reference if reference is not None else self._ag @@ -183,14 +191,14 @@ def _prepare(self): self._weights = None def _single_frame(self): - rmsd = rms.rmsd( + frame_rmsd = rms.rmsd( self._ag.positions, self._reference_pos, self._weights, center=False, superposition=self._superposition, ) - self.results.rmsd.append(rmsd) + self.results.rmsd.append(frame_rmsd) def _conclude(self): self.results.rmsd = np.asarray(self.results.rmsd) @@ -202,8 +210,7 @@ class LigandCOMDrift(AnalysisBase): """ def __init__(self, atomgroup, **kwargs): - super(LigandCOMDrift, self).__init__(atomgroup.universe.trajectory, **kwargs) - + super().__init__(atomgroup.universe.trajectory, **kwargs) self._ag = atomgroup def _prepare(self): @@ -224,7 +231,9 @@ def _conclude(self): def gather_rms_data( - pdb_topology: pathlib.Path, dataset: pathlib.Path, skip: Optional[int] = None + pdb_topology: pathlib.Path, + dataset: pathlib.Path, + skip: Optional[int] = None, ) -> dict[str, list[float]]: """ Compute structural RMSD-based metrics for a multistate BFE simulation. @@ -286,10 +295,10 @@ def gather_rms_data( u_top = mda.Universe(pdb_topology) - for i in range(n_lambda): + for state_idx in range(n_lambda): # cheeky, but we can read the PDB topology once and reuse per universe # this then only hits the PDB file once for all replicas - u = make_Universe(u_top._topology, ds, state=i) + u = make_Universe(u_top._topology, ds, state=state_idx) prot = u.select_atoms("protein and name CA") ligand = u.select_atoms("resname UNK") @@ -319,7 +328,7 @@ def gather_rms_data( # flattened = dist_mat[i, j] # output["protein_2D_RMSD"].append(flattened) - if ligand: + if ligand.n_atoms > 0: lig_rmsd = RMSDAnalysis(ligand, mass_weighted=True).run(step=skip) output["ligand_RMSD"].append(lig_rmsd.results.rmsd) # # Using the MDAnalysis RMSD class instead From 80b984d82f6a6e24cb43c7bd9dd569194e483de4 Mon Sep 17 00:00:00 2001 From: Hannah Baumann <43765638+hannahbaumann@users.noreply.github.com> Date: Fri, 22 May 2026 09:22:41 +0200 Subject: [PATCH 10/14] Update src/openfe_analysis/rmsd.py Co-authored-by: Irfan Alibay --- src/openfe_analysis/rmsd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openfe_analysis/rmsd.py b/src/openfe_analysis/rmsd.py index 23c2680..7caf3db 100644 --- a/src/openfe_analysis/rmsd.py +++ b/src/openfe_analysis/rmsd.py @@ -328,7 +328,7 @@ def gather_rms_data( # flattened = dist_mat[i, j] # output["protein_2D_RMSD"].append(flattened) - if ligand.n_atoms > 0: + if ligand: lig_rmsd = RMSDAnalysis(ligand, mass_weighted=True).run(step=skip) output["ligand_RMSD"].append(lig_rmsd.results.rmsd) # # Using the MDAnalysis RMSD class instead From db61dc4ef7c03fa2eab4b54614d42b051c880af9 Mon Sep 17 00:00:00 2001 From: hannahbaumann Date: Fri, 22 May 2026 11:02:40 +0200 Subject: [PATCH 11/14] Address review comments --- src/openfe_analysis/rmsd.py | 167 ++++++++++++++++++------------------ 1 file changed, 84 insertions(+), 83 deletions(-) diff --git a/src/openfe_analysis/rmsd.py b/src/openfe_analysis/rmsd.py index 7caf3db..eb98eec 100644 --- a/src/openfe_analysis/rmsd.py +++ b/src/openfe_analysis/rmsd.py @@ -52,7 +52,7 @@ def make_Universe(top: pathlib.Path, trj: nc.Dataset, state: int) -> mda.Univers - Unwraps protein and ligand atom to be made whole - Shifts protein chains and the ligand to the image closest to the first protein chain (:class:`ClosestImageShift`) - - Aligns the entire system to minimise the protein RMSD (:class:`Aligner`) + - Aligns the entire system to minimize the protein RMSD (:class:`Aligner`) If only a ligand is present: @@ -87,7 +87,7 @@ def make_Universe(top: pathlib.Path, trj: nc.Dataset, state: int) -> mda.Univers else: # if there's no protein # - make the ligand not jump periodic images between frames - # - align the ligand to minimise its RMSD + # - align the ligand to minimize its RMSD nope = NoJump(ligand) align = Aligner(ligand) @@ -105,46 +105,54 @@ class Protein2DRMSD(AnalysisBase): For all unique frame pairs ``(i, j)`` with ``i < j``, this function computes the RMSD between atomic coordinates after optimal alignment. + Alignment is performed by centering each frame on its center of geometry, + followed by rotational and translational superposition using the QCP method. + + Parameters + ---------- + atomgroup: mda.AtomGroup + Protein atoms (e.g. CA selection) + weights: np.ndarray, optional + Per-atom weights to use in the RMSD calculation. If ``None``, + all atoms are weighted equally. + + .. note:: + + All atom positions are accumulated in memory during the trajectory + iteration. For long trajectories or large systems this may result in + significant memory usage. Consider using the ``step`` argument to + ``run()`` to reduce the number of frames analyzed. """ - def __init__(self, atomgroup, weights=None, **kwargs): - """ - Parameters - ---------- - atomgroup: AtomGroup - Protein atoms (e.g. CA selection) - weights: np.ndarray, optional - Per-atom weights to use in the RMSD calculation. If ``None``, - all atoms are weighted equally. - """ + _analysis_algorithm_is_parallelizable = False + + def __init__(self, atomgroup: mda.AtomGroup, weights: Optional[np.ndarray] = None, **kwargs): super().__init__(atomgroup.universe.trajectory, **kwargs) self._weights = weights self._ag = atomgroup - def _prepare(self): - self._coords = [] - self.results.rmsd2d = [] + def _prepare(self) -> None: + self._coords = np.zeros((self.n_frames, self._ag.n_atoms, 3), dtype=np.float64) - def _single_frame(self): - self._coords.append(self._ag.positions.copy()) + def _single_frame(self) -> None: + self._coords[self._frame_index] = self._ag.positions - def _conclude(self): - positions = np.asarray(self._coords) - nframes = positions.shape[0] + def _conclude(self) -> None: + nframes = self._coords.shape[0] - output = [] - for i, j in itertools.combinations(range(nframes), 2): - posi, posj = positions[i], positions[j] - pair_rmsd = rms.rmsd( + # Pre-allocate numpy arrays + n_pairs = nframes * (nframes - 1) // 2 + self.results.rmsd2d = np.empty(n_pairs) + + for idx, (i, j) in enumerate(itertools.combinations(range(nframes), 2)): + posi, posj = self._coords[i], self._coords[j] + self.results.rmsd2d[idx] = rms.rmsd( posi, posj, self._weights, center=True, superposition=True, ) - output.append(pair_rmsd) - - self.results.rmsd2d = np.asarray(output) class RMSDAnalysis(AnalysisBase): @@ -156,21 +164,29 @@ class RMSDAnalysis(AnalysisBase): atomgroup : MDAnalysis.AtomGroup Atoms to compute RMSD for. reference: Optional[MDAnalysis.AtomGroup] - Reference AtomGroup. If ``None``, the reference positions are captured - from the mobile AtomGroup at the start of the run (i.e. whatever frame - the trajectory is on when ``.run()`` is called). + Reference AtomGroup. If ``None``, the reference positions are taken + from the first analyzed frame, so ``run(start=10)`` measures RMSD + relative to frame 10, not frame 0. mass_weighted : bool, optional If True, compute mass-weighted RMSD. + center : bool, optional + If ``True``, subtract the center of geometry before computing RMSD. + Defaults to ``False`` as the trajectory is assumed to be pre-centered. superposition : bool, optional If ``True``, perform rotational superposition before computing RMSD. + Defaults to ``False`` as the trajectory is assumed to be pre-superposed. + """ + _analysis_algorithm_is_parallelizable = False + def __init__( self, - atomgroup, - reference=None, - mass_weighted=False, - superposition=False, + atomgroup: mda.AtomGroup, + reference: Optional[mda.AtomGroup] = None, + mass_weighted: bool = False, + center: bool = False, + superposition: bool = False, **kwargs, ): super().__init__(atomgroup.universe.trajectory, **kwargs) @@ -178,11 +194,12 @@ def __init__( self._ag = atomgroup self._reference = reference if reference is not None else self._ag self._mass_weighted = mass_weighted + self._center = center self._superposition = superposition - def _prepare(self): - self.results.rmsd = [] - + def _prepare(self) -> None: + self.results.rmsd = np.zeros(self.n_frames, dtype=np.float64) + # reference is taken from the first analyzed frame, not necessarily frame 0 self._reference_pos = self._reference.positions.copy() if self._mass_weighted: @@ -190,44 +207,55 @@ def _prepare(self): else: self._weights = None - def _single_frame(self): - frame_rmsd = rms.rmsd( + def _single_frame(self) -> None: + self.results.rmsd[self._frame_index] = rms.rmsd( self._ag.positions, self._reference_pos, self._weights, - center=False, + center=self._center, superposition=self._superposition, ) - self.results.rmsd.append(frame_rmsd) - - def _conclude(self): - self.results.rmsd = np.asarray(self.results.rmsd) class LigandCOMDrift(AnalysisBase): """ Ligand center-of-mass displacement from initial position. + + Parameters + ---------- + atomgroup : mda.AtomGroup + Ligand atoms for which the center-of-mass drift is calculated. + + .. note:: + The initial position is taken from the first analyzed frame, so + ``run(start=10)`` measures drift relative to frame 10, not frame 0. + + .. note:: + PBC are not applied as the trajectory is assumed to have been + pre-processed, ensuring the ligand does not jump between periodic images. + Passing a box to apply the minimum image convention would give + incorrect results for ligands that have drifted more than half a box + length from their starting position. """ - def __init__(self, atomgroup, **kwargs): + _analysis_algorithm_is_parallelizable = False + + def __init__(self, atomgroup: mda.AtomGroup, **kwargs): super().__init__(atomgroup.universe.trajectory, **kwargs) self._ag = atomgroup - def _prepare(self): - self.results.com_drift = [] + def _prepare(self) -> None: + self.results.com_drift = np.zeros(self.n_frames, dtype=np.float64) + # initial COM is taken from the first analyzed frame, not necessarily frame 0 self._initial_com = self._ag.center_of_mass() - def _single_frame(self): - # distance between start and current ligand position - # ignores PBC, but we've already centered the traj - drift = mda.lib.distances.calc_bonds( + def _single_frame(self) -> None: + # no box argument, assumes the ligand stays in a consistent image; + # applying the minimum image convention could mask large drifts > half a box length + self.results.com_drift[self._frame_index] = mda.lib.distances.calc_bonds( self._ag.center_of_mass(), self._initial_com, ) - self.results.com_drift.append(drift) - - def _conclude(self): - self.results.com_drift = np.asarray(self.results.com_drift) def gather_rms_data( @@ -306,41 +334,14 @@ def gather_rms_data( if prot: prot_rmsd = RMSDAnalysis(prot).run(step=skip) output["protein_RMSD"].append(prot_rmsd.results.rmsd) - # # Using the MDAnalysis RMSD class instead - # gs = ["protein and name CA"] - # prot_rmsd = rms.RMSD( - # u, select="protein and name CA", groupselections=gs, weights="mass") - # prot_rmsd.run(step=skip) - # # The results contain: - # # - frame number - # # - time - # # - RMSD based on select (after superimposing) - # # - RMSD based on groupselections, one array per selection - # output["protein_RMSD"].append(prot_rmsd.results.rmsd.T[3]) prot_rmsd2d = Protein2DRMSD(prot).run(step=skip) output["protein_2D_RMSD"].append(prot_rmsd2d.results.rmsd2d) - # # Using the MDAnalysis DistanceMatrix class - # prot_rmsd2d = diffusionmap.DistanceMatrix(u, select="protein and name CA") - # prot_rmsd2d.run(step=skip) - # dist_mat = prot_rmsd2d.results.dist_matrix - # i, j = np.triu_indices_from(dist_mat, k=1) - # flattened = dist_mat[i, j] - # output["protein_2D_RMSD"].append(flattened) if ligand: lig_rmsd = RMSDAnalysis(ligand, mass_weighted=True).run(step=skip) output["ligand_RMSD"].append(lig_rmsd.results.rmsd) - # # Using the MDAnalysis RMSD class instead - # groupselections = ["resname UNK"] - # lig_rmsd = rms.RMSD( - # u, - # select="protein and name CA", - # groupselections=groupselections, - # weights="mass", - # ) - # lig_rmsd.run(step=skip) - # output["ligand_RMSD"].append(lig_rmsd.results.rmsd.T[3]) + lig_com_drift = LigandCOMDrift(ligand).run(step=skip) output["ligand_wander"].append(lig_com_drift.results.com_drift) From f22d96faf1abe7c421401882e006c045b2da9743 Mon Sep 17 00:00:00 2001 From: hannahbaumann Date: Fri, 22 May 2026 11:17:24 +0200 Subject: [PATCH 12/14] Fix docs build --- src/openfe_analysis/rmsd.py | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/src/openfe_analysis/rmsd.py b/src/openfe_analysis/rmsd.py index eb98eec..3870bb1 100644 --- a/src/openfe_analysis/rmsd.py +++ b/src/openfe_analysis/rmsd.py @@ -116,12 +116,12 @@ class Protein2DRMSD(AnalysisBase): Per-atom weights to use in the RMSD calculation. If ``None``, all atoms are weighted equally. - .. note:: - - All atom positions are accumulated in memory during the trajectory - iteration. For long trajectories or large systems this may result in - significant memory usage. Consider using the ``step`` argument to - ``run()`` to reduce the number of frames analyzed. + Notes + ----- + All atom positions are accumulated in memory during the trajectory + iteration. For long trajectories or large systems this may result in + significant memory usage. Consider using the ``step`` argument to + ``run()`` to reduce the number of frames analyzed. """ _analysis_algorithm_is_parallelizable = False @@ -175,7 +175,6 @@ class RMSDAnalysis(AnalysisBase): superposition : bool, optional If ``True``, perform rotational superposition before computing RMSD. Defaults to ``False`` as the trajectory is assumed to be pre-superposed. - """ _analysis_algorithm_is_parallelizable = False @@ -226,16 +225,16 @@ class LigandCOMDrift(AnalysisBase): atomgroup : mda.AtomGroup Ligand atoms for which the center-of-mass drift is calculated. - .. note:: - The initial position is taken from the first analyzed frame, so - ``run(start=10)`` measures drift relative to frame 10, not frame 0. - - .. note:: - PBC are not applied as the trajectory is assumed to have been - pre-processed, ensuring the ligand does not jump between periodic images. - Passing a box to apply the minimum image convention would give - incorrect results for ligands that have drifted more than half a box - length from their starting position. + Notes + ----- + The initial position is taken from the first analyzed frame, so + ``run(start=10)`` measures drift relative to frame 10, not frame 0. + + PBC are not applied as the trajectory is assumed to have been + pre-processed, ensuring the ligand does not jump between periodic images. + Passing a box to apply the minimum image convention would give + incorrect results for ligands that have drifted more than half a box + length from their starting position. """ _analysis_algorithm_is_parallelizable = False From fdf4531af9378b479f9c058f265124290946919a Mon Sep 17 00:00:00 2001 From: hannahbaumann Date: Fri, 22 May 2026 12:10:16 +0200 Subject: [PATCH 13/14] Add MDAnalysisTests as test dependency --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 4381d5b..c93e5c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,7 @@ test = [ "pytest-xdist", "pytest-cov", "pooch", + "MDAnalysisTests", ] [project.urls] From 3c001a1e20a104dcb59beb515b7f3f5a9908b3c6 Mon Sep 17 00:00:00 2001 From: hannahbaumann Date: Tue, 26 May 2026 09:49:06 +0200 Subject: [PATCH 14/14] Add news entry --- news/refactor_analyses_base.rst | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 news/refactor_analyses_base.rst diff --git a/news/refactor_analyses_base.rst b/news/refactor_analyses_base.rst new file mode 100644 index 0000000..7bde8cf --- /dev/null +++ b/news/refactor_analyses_base.rst @@ -0,0 +1,23 @@ +**Added:** + +* + +**Changed:** + +* Refactored the structural analyses methods into MDAnalysis AnalysisBase classes. + +**Deprecated:** + +* + +**Removed:** + +* + +**Fixed:** + +* + +**Security:** + +*