Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
170 changes: 159 additions & 11 deletions openms_python/py_aasequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@

from __future__ import annotations

from typing import Optional
from typing import Optional, Literal
import pyopenms as oms
import warnings


class Py_AASequence:
"""
A Pythonic wrapper around pyOpenMS AASequence.
A Pythonic, immutable wrapper around pyOpenMS AASequence.

This class provides intuitive properties and methods for working with
amino acid sequences, including common operations like reversing and
Expand Down Expand Up @@ -40,7 +41,7 @@ def __init__(self, native_sequence: Optional[oms.AASequence] = None):
@classmethod
def from_string(cls, sequence_str: str) -> Py_AASequence:
"""
Create AASequence from string representation.
Create Py_AASequence from string representation.

Args:
sequence_str: String representation of the amino acid sequence.
Expand All @@ -57,6 +58,20 @@ def from_string(cls, sequence_str: str) -> Py_AASequence:

# ==================== Pythonic Properties ====================

@classmethod
def from_native(cls, native_sequence: oms.AASequence) -> Py_AASequence:
    """
    Wrap an existing pyOpenMS AASequence in a Py_AASequence.

    Args:
        native_sequence (oms.AASequence): Native pyOpenMS sequence object
            that is handed to the class constructor.

    Returns:
        Py_AASequence: New wrapper object built from ``native_sequence``.
    """
    wrapped = cls(native_sequence)
    return wrapped
Comment on lines +61 to +73
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Fix typos and complete docstring in from_native.

The docstring has two issues:

  • Line 66: Args description is incomplete—no text follows native_sequence (oms.AASequence):.
  • Line 69: Typo "opject" should be "object".

Apply this diff:

     @classmethod
     def from_native(cls, native_sequence: oms.AASequence) -> Py_AASequence:
         """
         Creates Py_AASequence from native pyOpenMS AASequence.
 
         Args:
-            native_sequence (oms.AASequence): 
+            native_sequence (oms.AASequence): Native pyOpenMS AASequence object to wrap.
 
         Returns:
-            Py_AASequence: New wrapped opject
+            Py_AASequence: New wrapped object.
 
         """
         return cls(native_sequence)
🤖 Prompt for AI Agents
In openms_python/py_aasequence.py around lines 60 to 72, the from_native
docstring is incomplete and has a typo: fill in the Args description for
native_sequence to explain that it is a pyOpenMS AASequence to be wrapped (e.g.,
"native_sequence (oms.AASequence): the native pyOpenMS amino acid sequence to
wrap"), correct "opject" to "object" in the Returns section, and optionally
tighten wording to "New wrapped object" or "Wrapped Py_AASequence instance" so
the docstring reads clearly and completely.


@property
def native(self) -> oms.AASequence:
"""Return the underlying pyOpenMS AASequence."""
Expand Down Expand Up @@ -204,26 +219,126 @@ def __eq__(self, other: object) -> bool:
return False
return self.sequence == other.sequence

def __getitem__(self, index):
    """
    Get residue(s) at position(s).

    Supports both single indexing and slicing, returning Py_AASequence objects.

    Args:
        index: Integer for single residue, or slice object for subsequence.
            Negative integers count back from the end of the sequence.

    Returns:
        Py_AASequence: Wrapped residue or subsequence. A slice that selects
            nothing (e.g. ``seq[4:1]`` or ``seq[7:7]``) yields an empty
            sequence.

    Raises:
        IndexError: If an integer index lies outside the sequence on either
            side (including negative indices below ``-len(self)``).
        ValueError: If a slice uses a step other than 1.

    Example:
        >>> seq = Py_AASequence.from_string("PEPTIDE")
        >>> seq[1]  # Returns Py_AASequence("E")
        >>> seq[1:4]  # Returns Py_AASequence("EPT")
        >>> seq[-1]  # Returns Py_AASequence("E")
    """
    length = len(self)

    if isinstance(index, slice):
        start, stop, step = index.indices(length)
        if step != 1:
            raise ValueError("Step slicing is not supported for amino acid sequences")
        if stop <= start:
            # stop - start would be zero or negative here, which is not a
            # usable residue count for getSubsequence, so build the empty
            # sequence directly instead of calling into pyOpenMS.
            return Py_AASequence.from_native(oms.AASequence())
        return Py_AASequence.from_native(self._sequence.getSubsequence(start, stop - start))

    # Normalise a negative index, then check BOTH bounds: an index below
    # -length is still negative after the shift and must be rejected.
    position = index + length if index < 0 else index
    if position < 0 or position >= length:
        raise IndexError(f"Index {index} out of range for sequence of length {length}")
    return Py_AASequence.from_native(self._sequence.getSubsequence(position, 1))

def __iter__(self):
    """Yield ``self[i]`` for every position from 0 up to ``len(self) - 1``."""
    position = 0
    total = len(self)
    while position < total:
        yield self[position]
        position += 1
def __add__(self, other: Py_AASequence | str) -> Py_AASequence:
    """
    Build a new sequence by appending ``other`` to this one.

    The two string forms are joined and the result is parsed again with
    ``Py_AASequence.from_string``.

    Args:
        other: Py_AASequence or string to append.

    Returns:
        Py_AASequence: New concatenated sequence, or ``NotImplemented`` when
            ``other`` is neither a Py_AASequence nor a string.

    Example:
        >>> seq1 = Py_AASequence.from_string("PEP")
        >>> seq2 = Py_AASequence.from_string("TIDE")
        >>> combined = seq1 + seq2
        >>> print(combined.sequence)
        PEPTIDE
        >>> combined2 = seq1 + "TIDE"
        >>> print(combined2.sequence)
        PEPTIDE
    """
    if not isinstance(other, (Py_AASequence, str)):
        return NotImplemented

    head = self.sequence
    tail = other.sequence if isinstance(other, Py_AASequence) else other
    return Py_AASequence.from_string(head + tail)

def __radd__(self, other: str) -> Py_AASequence:
    """
    Handle ``str + Py_AASequence`` by prepending the string.

    Returns ``NotImplemented`` when the left operand is not a string.

    Example:
        >>> seq = Py_AASequence.from_string("TIDE")
        >>> combined = "PEP" + seq
        >>> print(combined.sequence)
        PEPTIDE
    """
    if not isinstance(other, str):
        return NotImplemented
    return Py_AASequence.from_string(other + self.sequence)

def __mul__(self, times: int) -> Py_AASequence:
    """
    Repeat the sequence a whole number of times.

    Args:
        times: Number of times to repeat (must be >= 0).

    Returns:
        Py_AASequence: New repeated sequence. ``NotImplemented`` is returned
            when ``times`` is not an int or is negative.

    Example:
        >>> seq = Py_AASequence.from_string("PEP")
        >>> repeated = seq * 3
        >>> print(repeated.sequence)
        PEPPEPPEP
    """
    if isinstance(times, int) and not times < 0:
        repeated_text = self.sequence * times
        return Py_AASequence.from_string(repeated_text)
    return NotImplemented

def __rmul__(self, times: int) -> Py_AASequence:
    """Handle ``int * Py_AASequence`` by forwarding to ``__mul__``."""
    # Call __mul__ directly (not ``self * times``) so a NotImplemented result
    # is passed back unchanged.
    outcome = self.__mul__(times)
    return outcome
def __contains__(self, substring: str) -> bool:
    """Support ``substring in seq`` by delegating to ``has_substring``."""
    is_present = self.has_substring(substring)
    return is_present

def __hash__(self) -> int:
    """Hash the value of ``self.sequence`` so instances work in sets and as dict keys."""
    sequence_value = self.sequence
    return hash(sequence_value)

def __lt__(self, other: Py_AASequence) -> bool:
    """
    Order two sequences by comparing their ``sequence`` values with ``<``.

    Returns ``NotImplemented`` when ``other`` is not a Py_AASequence.
    """
    if isinstance(other, Py_AASequence):
        return self.sequence < other.sequence
    return NotImplemented
def count(self, residue: str) -> int:
    """
    Count occurrences of a residue, to be consistent with ``str.count()``.

    The count is taken on the unmodified sequence, so modifications are
    currently not accounted for; a warning saying so is emitted on every call.

    Args:
        residue: Residue (or substring) to look for.

    Returns:
        int: Number of occurrences of ``residue`` in the unmodified sequence.
    """
    # stacklevel=2 attributes the warning to the caller's line rather than to
    # this method, so users can see which of their calls triggered it.
    warnings.warn("count method does not account for modifications", stacklevel=2)
    return self.unmodified_sequence.count(residue)

# ==================== Additional Utilities ====================

def get_mz(self, charge: int) -> float:
Expand Down Expand Up @@ -277,4 +392,37 @@ def has_suffix(self, suffix: str) -> bool:
bool: True if sequence ends with suffix.
"""
return self._sequence.hasSuffix(oms.AASequence.fromString(suffix))


# ===================== Exporting =======================
def to_string(self, modified=True, mod_format: Literal['default', 'unimod', 'bracket'] = 'default') -> str:
    """
    Render the sequence as a string.

    Args:
        modified (bool): Whether to include modifications in the string.
            When False the unmodified sequence is returned and
            ``mod_format`` is not consulted.
        mod_format (Literal['default', 'unimod', 'bracket']): Format for modifications.
            'default' calls ``toString()`` on the native sequence,
            'unimod' calls ``toUniModString()``,
            'bracket' calls ``toBracketString()``.
            Default is 'default'.

    Returns:
        str: Amino acid sequence as string.

    Raises:
        ValueError: If ``modified`` is true and ``mod_format`` is not one of
            the supported values.

    Example:
        >>> seq = Py_AASequence.from_string("PEPTIDE")
        >>> seq_str = seq.to_string()
    """
    if not modified:
        return self.unmodified_sequence

    # Format name -> name of the native method that renders it.
    renderers = (
        ('default', 'toString'),
        ('unimod', 'toUniModString'),
        ('bracket', 'toBracketString'),
    )
    for format_name, method_name in renderers:
        if mod_format == format_name:
            return getattr(self._sequence, method_name)()

    raise ValueError(f"Unsupported mod_format: {mod_format}, supported are 'default', 'unimod' and 'bracket'")
51 changes: 44 additions & 7 deletions tests/test_py_aasequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,25 +118,22 @@ def test_py_aasequence_iteration():
seq = Py_AASequence.from_string("PEPTIDE")
residues = list(seq)

assert residues == ["P", "E", "P", "T", "I", "D", "E"]
assert [res.sequence for res in residues] == ["P", "E", "P", "T", "I", "D", "E"]
assert len(residues) == 7


def test_py_aasequence_indexing():
    """Test indexing into sequence."""
    seq = Py_AASequence.from_string("PEPTIDE")

    # Integer indexing returns a wrapped single-residue sequence, so compare
    # through its ``sequence`` value rather than against a bare string.
    assert seq[0].sequence == "P"
    assert seq[1].sequence == "E"
    assert seq[6].sequence == "E"

    # Test out of bounds
    with pytest.raises(IndexError):
        _ = seq[7]


def test_py_aasequence_string_representation():
"""Test string representations."""
Expand Down Expand Up @@ -258,3 +255,43 @@ def test_py_aasequence_with_native_aasequence():

assert seq.sequence == "PEPTIDE"
assert seq.native is native


def test_py_aasequence_to_string():
    """Check to_string output for each supported option and for a bad format."""
    peptide = Py_AASequence.from_string("PEPTIDEM(Oxidation)")

    # No arguments: modified string in the default format
    assert peptide.to_string() == "PEPTIDEM(Oxidation)"

    # modified=False: plain residues only
    assert peptide.to_string(modified=False) == "PEPTIDEM"

    # Bracket notation
    assert peptide.to_string(modified=True, mod_format='bracket') == "PEPTIDEM[147]"

    # UniMod notation
    assert peptide.to_string(modified=True, mod_format='unimod') == "PEPTIDEM(UniMod:35)"

    # An unknown format name is rejected
    with pytest.raises(ValueError):
        peptide.to_string(modified=True, mod_format='invalid_format')


def test_slicing():
    """Check integer and slice access on a sequence carrying a modification."""
    peptide = Py_AASequence.from_string('PEPTIDEM(Oxidation)R')
    expectations = (
        (0, 'P'),
        (-1, 'R'),
        (slice(1, 4), 'EPT'),
        (slice(-2, None), 'M(Oxidation)R'),
    )
    for key, expected in expectations:
        assert peptide[key].sequence == expected

def test_count():
    """Check residue counts for present and absent residues."""
    peptide = Py_AASequence.from_string('PEPTIDEM(Oxidation)R')
    expected_counts = {'E': 2, 'P': 2, 'K': 0}
    for residue, expected in expected_counts.items():
        assert peptide.count(residue) == expected