Source code for rush.prepare._protein

"""
Protein preparation module for the Rush Python client.

This module supports system preparation workflows such as converting PDB inputs
to TRC, protonating and optimizing hydrogen positions, and augmenting
structures with connectivity and formal charge information before downstream
calculations.

Usage::

    from rush import prepare

    models = prepare.protein("protein.pdb").fetch()
    print(models[0].topology.symbols)
"""

import json
import sys
from collections.abc import Iterator
from dataclasses import dataclass
from pathlib import Path
from string import Template
from typing import Any, Literal

from gql.transport.exceptions import TransportQueryError

from rush import Chains, Residues, Topology

from .._trc import TRCPaths, TRCRef, to_chains_vobj, to_residues_vobj, to_topology_vobj
from .._utils import optional_str
from ..client import (
    RunOpts,
    RunSpec,
    RushObject,
    _get_project_id,
    _submit_rex,
)
from ..convert import _single_trc, from_json, from_pdb
from ..mol import TRC
from ..run import RushRun

# ---------------------------------------------------------------------------
# Result types
# ---------------------------------------------------------------------------


@dataclass(frozen=True)
class ResultRef:
    """Handle to prepare-protein output stored in the Rush object store.

    Holds one :class:`TRCRef` (topology/residues/chains triplet) per model;
    an input PDB with several models yields several entries. The handle
    supports ``len()``, integer indexing and iteration over its entries.
    """

    models: list[TRCRef]

    def __getitem__(self, index: int) -> TRCRef:
        return self.models[index]

    def __len__(self) -> int:
        return len(self.models)

    def __iter__(self) -> Iterator[TRCRef]:
        return iter(self.models)

    @classmethod
    def from_raw_output(cls, res: Any) -> "ResultRef":
        """Build a ``ResultRef`` from raw ``collect_run`` output.

        ``res`` must be a non-empty list of groups. Every group must be a
        list of exactly three dicts, in the order topology, residues,
        chains; each dict is turned into a ``RushObject``.

        Raises:
            ValueError: If ``res`` is not a non-empty list, if a group is
                not a 3-element list, or if a group element is not a dict.
        """
        if not (isinstance(res, list) and len(res) > 0):
            size_note = f" with {len(res)} items" if hasattr(res, "__len__") else ""
            raise ValueError(
                "prepare_protein should return a non-empty list, "
                f"got {type(res).__name__}{size_note}."
            )

        refs: list[TRCRef] = []
        for idx, triplet in enumerate(res):
            if not (isinstance(triplet, list) and len(triplet) == 3):
                count_note = (
                    f" with {len(triplet)} items" if isinstance(triplet, list) else ""
                )
                raise ValueError(
                    f"prepare_protein output group {idx} expected a list of 3 elements, "
                    f"got {type(triplet).__name__}{count_note}."
                )

            # Validate all three entries before converting any of them.
            if not all(isinstance(part, dict) for part in triplet):
                raise ValueError(
                    f"prepare_protein output group {idx} elements must be dicts."
                )

            topology_raw, residues_raw, chains_raw = triplet
            refs.append(
                TRCRef(
                    topology=RushObject.from_dict(topology_raw),
                    residues=RushObject.from_dict(residues_raw),
                    chains=RushObject.from_dict(chains_raw),
                )
            )

        return cls(models=refs)

    def fetch(self) -> list[TRC]:
        """Download every model's output and parse it into a TRC.

        The returned list has one TRC per model of the input PDB. Most
        PDBs carry a single model, so ``result[0]`` is the common pattern.
        """
        return [ref.fetch() for ref in self.models]

    def save(self) -> list[TRCPaths]:
        """Download every model's output and save it to the workspace.

        The returned list has one ``TRCPaths`` per model of the input PDB.
        """
        return [ref.save() for ref in self.models]
# ---------------------------------------------------------------------------
# Submission
# ---------------------------------------------------------------------------
[docs] def protein( mol: TRC | TRCRef | tuple[ Path | str | RushObject | Topology, Path | str | RushObject | Residues, Path | str | RushObject | Chains, ] | Path | str, ph: float | None = None, naming_scheme: Literal["AMBER", "CHARMM"] | None = None, capping_style: Literal["never", "truncated", "always"] | None = None, truncation_threshold: int | None = None, opt: bool | None = None, debump: bool | None = None, run_spec: RunSpec = RunSpec(gpus=1), run_opts: RunOpts = RunOpts(), ) -> RushRun[ResultRef]: """ Submit a prepare-protein job for a PDB or TRC file. Returns a :class:`~rush.run.RushRun` handle. Call ``.fetch()`` to get the parsed TRC, or ``.save()`` to write the output files to disk. """ # Upload inputs match mol: case TRC(): trc_ref = TRCRef.upload(mol) case TRCRef(): trc_ref = mol case (t, r, c): trc_ref = TRCRef( RushObject.from_dict(to_topology_vobj(t)), RushObject.from_dict(to_residues_vobj(r)), RushObject.from_dict(to_chains_vobj(c)), ) case Path() | str(): input_path = mol if isinstance(input_path, str): input_path = Path(input_path) with open(input_path) as f: if input_path.suffix == ".pdb": trc = from_pdb(f.read()) else: trc = from_json(json.load(f)) trc = _single_trc(trc, input_path) trc_ref = TRCRef.upload(trc) # Run rex rex = Template("""let obj_j = λ j → VirtualObject { path = j, format = ObjectFormat::json, size = 0 }, prepare_protein = λ topology residues chains → prepare_protein_rex_s ($run_spec) (prepare_protein_rex::PrepareProteinOptions { ph = $ph, naming_scheme = $naming_scheme, capping_style = $capping_style, truncation_threshold = $truncation_threshold, opt = $opt, debump = $debump, }) [( (obj_j topology), (obj_j residues), (obj_j chains) )] in prepare_protein "$topology_vobj_path" "$residues_vobj_path" "$chains_vobj_path" """).substitute( run_spec=run_spec._to_rex(), ph=optional_str(ph), naming_scheme=optional_str( naming_scheme.title() if naming_scheme is not None else None, prefix="prepare_protein_rex::NamingScheme::", ), 
capping_style=optional_str( capping_style.title() if capping_style is not None else None, prefix="prepare_protein_rex::CappingStyle::", ), truncation_threshold=optional_str(truncation_threshold), opt=optional_str(opt), debump=optional_str(debump), topology_vobj_path=trc_ref.topology.path, residues_vobj_path=trc_ref.residues.path, chains_vobj_path=trc_ref.chains.path, ) try: return RushRun( _submit_rex(_get_project_id(), rex, run_opts), ResultRef, ) except TransportQueryError as e: if e.errors: for error in e.errors: print(f"Error: {error['message']}", file=sys.stderr) raise