Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
169 changes: 158 additions & 11 deletions openms_python/py_aasequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@

from __future__ import annotations

from typing import Optional
from typing import Optional, Literal
import pyopenms as oms


class Py_AASequence:
"""
A Pythonic wrapper around pyOpenMS AASequence.
A Pythonic, immutable wrapper around pyOpenMS AASequence.

This class provides intuitive properties and methods for working with
amino acid sequences, including common operations like reversing and
Expand Down Expand Up @@ -40,7 +40,7 @@ def __init__(self, native_sequence: Optional[oms.AASequence] = None):
@classmethod
def from_string(cls, sequence_str: str) -> Py_AASequence:
"""
Create AASequence from string representation.
Create Py_AASequence from string representation.

Args:
sequence_str: String representation of the amino acid sequence.
Expand All @@ -57,6 +57,20 @@ def from_string(cls, sequence_str: str) -> Py_AASequence:

# ==================== Pythonic Properties ====================

@classmethod
def from_native(cls, native_sequence: oms.AASequence) -> Py_AASequence:
    """
    Wrap an existing pyOpenMS AASequence.

    Args:
        native_sequence: Native pyOpenMS AASequence object to wrap.

    Returns:
        Py_AASequence: New wrapper object, built by passing
            ``native_sequence`` to the class constructor.
    """
    wrapped = cls(native_sequence)
    return wrapped
Comment on lines +61 to +73
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Fix typos and complete docstring in from_native.

The docstring has two issues:

  • Line 66: Args description is incomplete—no text follows native_sequence (oms.AASequence):.
  • Line 69: Typo "opject" should be "object".

Apply this diff:

     @classmethod
     def from_native(cls, native_sequence: oms.AASequence) -> Py_AASequence:
         """
         Creates Py_AASequence from native pyOpenMS AASequence.
 
         Args:
-            native_sequence (oms.AASequence): 
+            native_sequence (oms.AASequence): Native pyOpenMS AASequence object to wrap.
 
         Returns:
-            Py_AASequence: New wrapped opject
+            Py_AASequence: New wrapped object.
 
         """
         return cls(native_sequence)
🤖 Prompt for AI Agents
In openms_python/py_aasequence.py around lines 60 to 72, the from_native
docstring is incomplete and has a typo: fill in the Args description for
native_sequence to explain that it is a pyOpenMS AASequence to be wrapped (e.g.,
"native_sequence (oms.AASequence): the native pyOpenMS amino acid sequence to
wrap"), correct "opject" to "object" in the Returns section, and optionally
tighten wording to "New wrapped object" or "Wrapped Py_AASequence instance" so
the docstring reads clearly and completely.


@property
def native(self) -> oms.AASequence:
"""Return the underlying pyOpenMS AASequence."""
Expand Down Expand Up @@ -204,26 +218,126 @@ def __eq__(self, other: object) -> bool:
return False
return self.sequence == other.sequence

def __getitem__(self, index):
    """
    Get residue(s) at position(s).

    Supports both single indexing and slicing, returning Py_AASequence objects.

    Args:
        index: Integer for single residue (negative values count from the
            end), or slice object for a contiguous subsequence.

    Returns:
        Py_AASequence: Wrapped residue or subsequence.

    Raises:
        IndexError: If an integer index lies outside ``[-len(self), len(self))``.
        ValueError: If a slice uses a step other than 1.

    Example:
        >>> seq = Py_AASequence.from_string("PEPTIDE")
        >>> seq[1]  # Returns Py_AASequence("E")
        >>> seq[1:4]  # Returns Py_AASequence("EPT")
        >>> seq[-1]  # Returns Py_AASequence("E")
    """
    length = len(self)

    if isinstance(index, slice):
        start, stop, step = index.indices(length)
        if step != 1:
            raise ValueError("Step slicing is not supported for amino acid sequences")
        if stop <= start:
            # Empty slice (e.g. seq[4:2] or seq[len:]): never hand a negative
            # or out-of-range request to the native getSubsequence call.
            return Py_AASequence.from_native(oms.AASequence())
        return Py_AASequence.from_native(self._sequence.getSubsequence(start, stop - start))

    # Normalise negative indices, then validate BOTH bounds: an index below
    # -len(self) is still negative after adjustment and must not reach
    # getResidue.
    position = index + length if index < 0 else index
    if position < 0 or position >= length:
        raise IndexError(f"Index {index} out of range for sequence of length {length}")

    # NOTE(review): only the one-letter code is kept here, so a modification on
    # the residue is not carried over, whereas slicing goes through
    # getSubsequence. Confirm whether this asymmetry is intended.
    residue = self._sequence.getResidue(position)
    residue_char = residue.getOneLetterCode()
    return Py_AASequence.from_string(residue_char)

def __iter__(self):
    """Yield ``self[i]`` for every position ``i`` from 0 up to ``len(self) - 1``."""
    total = len(self)
    position = 0
    while position < total:
        yield self[position]
        position += 1
def __add__(self, other: Py_AASequence | str) -> Py_AASequence:
    """
    Concatenate this sequence with another sequence or a string.

    The string forms of both operands are joined and parsed again with
    ``Py_AASequence.from_string``.

    Args:
        other: Py_AASequence or string to append.

    Returns:
        Py_AASequence: New concatenated sequence, or ``NotImplemented`` when
            ``other`` is neither a Py_AASequence nor a string.

    Example:
        >>> seq1 = Py_AASequence.from_string("PEP")
        >>> seq2 = Py_AASequence.from_string("TIDE")
        >>> combined = seq1 + seq2
        >>> print(combined.sequence)
        PEPTIDE
        >>> combined2 = seq1 + "TIDE"
        >>> print(combined2.sequence)
        PEPTIDE
    """
    if not isinstance(other, (Py_AASequence, str)):
        return NotImplemented

    prefix = self.sequence
    suffix = other.sequence if isinstance(other, Py_AASequence) else other
    return Py_AASequence.from_string(prefix + suffix)

def __radd__(self, other: str) -> Py_AASequence:
    """
    Handle ``str + Py_AASequence``.

    Returns ``NotImplemented`` when the left operand is not a string.

    Example:
        >>> seq = Py_AASequence.from_string("TIDE")
        >>> combined = "PEP" + seq
        >>> print(combined.sequence)
        PEPTIDE
    """
    if not isinstance(other, str):
        return NotImplemented
    return Py_AASequence.from_string(other + self.sequence)

def __mul__(self, times: int) -> Py_AASequence:
    """
    Repeat the sequence a given number of times.

    Args:
        times: Number of times to repeat (must be >= 0).

    Returns:
        Py_AASequence: New repeated sequence, or ``NotImplemented`` when
            ``times`` is not an int or is negative.

    Example:
        >>> seq = Py_AASequence.from_string("PEP")
        >>> repeated = seq * 3
        >>> print(repeated.sequence)
        PEPPEPPEP
    """
    if isinstance(times, int) and times >= 0:
        repeated_text = self.sequence * times
        return Py_AASequence.from_string(repeated_text)
    return NotImplemented

def __rmul__(self, times: int) -> Py_AASequence:
    """Handle ``int * Py_AASequence`` by delegating to ``__mul__``."""
    repeated = self.__mul__(times)
    return repeated
def __contains__(self, substring: Py_AASequence | str) -> bool:
    """
    Check if a subsequence is contained in this sequence.

    Args:
        substring: Subsequence to look for, given either as a string or as a
            Py_AASequence (the element type produced by iterating or
            indexing this class).

    Returns:
        bool: Result of ``has_substring`` for the string form of ``substring``.

    Raises:
        TypeError: If ``substring`` is neither a str nor a Py_AASequence,
            mirroring how ``in`` behaves for built-in strings.
    """
    if isinstance(substring, Py_AASequence):
        # Items yielded by __iter__/__getitem__ are wrappers, so unwrap them
        # to make ``seq[0] in seq`` work.
        substring = substring.sequence
    elif not isinstance(substring, str):
        raise TypeError(
            "'in <Py_AASequence>' requires str or Py_AASequence as left operand, "
            f"not {type(substring).__name__}"
        )
    return self.has_substring(substring)

def __hash__(self) -> int:
    """Hash the ``sequence`` string so instances can be used in sets and as dict keys."""
    sequence_text = self.sequence
    return hash(sequence_text)

def __lt__(self, other: Py_AASequence) -> bool:
    """
    Order two sequences by comparing their ``sequence`` strings.

    Returns ``NotImplemented`` when ``other`` is not a Py_AASequence.
    """
    if isinstance(other, Py_AASequence):
        return self.sequence < other.sequence
    return NotImplemented
def count(self, residue: str) -> int:
    """
    Count non-overlapping occurrences of a residue or subsequence.

    Counting follows ``str.count()`` and is done on the unmodified
    one-letter sequence, so letters inside modification names such as
    ``(Oxidation)`` are not counted.

    Args:
        residue: One-letter residue code, or a longer run of one-letter codes.

    Returns:
        int: Number of non-overlapping occurrences in the unmodified sequence.

    Example:
        >>> seq = Py_AASequence.from_string("PEPTIDE")
        >>> seq.count("P")
        2
        >>> seq.count("PE")
        1
    """
    # NOTE(review): this previously called self._sequence.count(residue); the
    # native pyOpenMS AASequence does not appear to provide a count() method,
    # so the plain-string form is used instead. Confirm against the pyOpenMS
    # API reference.
    return self.unmodified_sequence.count(residue)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

find . -name "py_aasequence.py" -type f

Repository: OpenMS/openms-python

Length of output: 96


🏁 Script executed:

rg -A 10 "def has_substring" --type py

Repository: OpenMS/openms-python

Length of output: 751


🏁 Script executed:

rg "def _sequence|self\._sequence\s*=" openms_python/py_aasequence.py -B 2 -A 2

Repository: OpenMS/openms-python

Length of output: 297


🏁 Script executed:

rg "def count" openms_python/py_aasequence.py -B 2 -A 5

Repository: OpenMS/openms-python

Length of output: 386


🏁 Script executed:

find . -path "*/test*" -name "*.py" -type f | head -10 | xargs rg "__contains__|\.count\(" 2>/dev/null | head -20

Repository: OpenMS/openms-python

Length of output: 46


🌐 Web query:

OpenMS AASequence::count String parameter documentation

💡 Result:

Do you mean AASequence::count (which I can't find in the AASequence API) or the String::count / substring-count behavior used with AASequence (e.g., counting residues/substrings)? Please clarify which exact method/signature you need (or paste the header/signature).


🏁 Script executed:

find . -type f -name "*test*.py" -o -name "test_*.py" | xargs grep -l "AASequence\|count\|__contains__" 2>/dev/null | head -5

Repository: OpenMS/openms-python

Length of output: 228


🏁 Script executed:

grep -r "def count" . --include="*.py" | grep -v ".pyc" | head -20

Repository: OpenMS/openms-python

Length of output: 138


🏁 Script executed:

rg "oms\.AASequence" openms_python/py_aasequence.py -B 1 -A 1 | head -30

Repository: OpenMS/openms-python

Length of output: 848


🏁 Script executed:

cat openms_python/py_aasequence.py | head -50

Repository: OpenMS/openms-python

Length of output: 1684


🏁 Script executed:

cat ./tests/test_py_aasequence.py

Repository: OpenMS/openms-python

Length of output: 7563


🏁 Script executed:

rg "test.*count|test.*contains|__contains__|\.count\(" ./tests/test_py_aasequence.py -B 2 -A 5

Repository: OpenMS/openms-python

Length of output: 46


🏁 Script executed:

rg "def __contains__" openms_python/py_aasequence.py -B 2 -A 5

Repository: OpenMS/openms-python

Length of output: 384


🏁 Script executed:

python3 << 'EOF'
import sys
try:
    import pyopenms as oms
    seq = oms.AASequence.fromString("PEPTIDE")
    # Check if count method exists
    if hasattr(seq, 'count'):
        print("count method exists on oms.AASequence")
        print(f"count('P') = {seq.count('P')}")
        print(f"count('E') = {seq.count('E')}")
        print(f"count('PE') = {seq.count('PE')}")
    else:
        print("count method does NOT exist on oms.AASequence")
        print(f"Available methods: {[m for m in dir(seq) if not m.startswith('_')]}")
except Exception as e:
    print(f"Error: {e}")
    print(f"Exception type: {type(e).__name__}")
EOF

Repository: OpenMS/openms-python

Length of output: 150


🏁 Script executed:

python3 << 'EOF'
import sys
try:
    from openms_python.py_aasequence import Py_AASequence
    seq = Py_AASequence.from_string("PEPTIDE")
    
    # Test __contains__ with non-str
    try:
        result = 123 in seq
        print(f"Non-str in sequence returned: {result}")
    except Exception as e:
        print(f"Non-str in sequence raised: {type(e).__name__}: {e}")
    
    # Test __contains__ with str
    try:
        result = "PEP" in seq
        print(f"Str 'PEP' in sequence returned: {result}")
    except Exception as e:
        print(f"Str in sequence raised: {type(e).__name__}: {e}")
        
except Exception as e:
    print(f"Setup error: {e}")
EOF

Repository: OpenMS/openms-python

Length of output: 150


Tighten type expectations for __contains__ and document count() behavior

Two small points here:

  1. Type handling in __contains__

    def __contains__(self, substring: str) -> bool:
        return self.has_substring(substring)

    At runtime, __contains__ will be called with whatever the caller uses. Non-str inputs will flow through to AASequence.fromString inside has_substring, raising an unclear error.

    A light guard improves robustness:

     def __contains__(self, substring: str) -> bool:
-        """Check if substring is in sequence."""
-        return self.has_substring(substring)
+        """Check if a string substring is in sequence."""
+        if not isinstance(substring, str):
+            return False
+
+        return self.has_substring(substring)

  2. Undocumented behavior of count()

    def count(self, residue: str) -> int:
        return self._sequence.count(residue)

No tests exist for this method. The docstring claims it mirrors str.count(), but it's unclear whether this:

  • counts single residues (characters),
  • counts subsequences,
  • or handles modifications differently.

Add a docstring clarifying the exact behavior and include tests with simple sequences to demonstrate expected results (e.g., count("P") and count("PE") on "PEPTIDE").

🤖 Prompt for AI Agents
In openms_python/py_aasequence.py around lines 324 to 339, __contains__
currently assumes the argument is a str and forwards it to has_substring causing
unclear errors for non-str inputs; add a light guard that checks
isinstance(substring, str) and return False (or raise TypeError if you prefer
strictness) when it's not a str, then call has_substring for valid strings. Also
update the count() docstring to explicitly state it mirrors str.count (i.e., it
counts occurrences of the given substring which may be a single residue or a
multi-character subsequence) and add unit tests that assert expected results
(e.g., "PEPTIDE".count("P") == 2 and .count("PE") == 1) to prevent regressions.


# ==================== Additional Utilities ====================

def get_mz(self, charge: int) -> float:
Expand Down Expand Up @@ -277,4 +391,37 @@ def has_suffix(self, suffix: str) -> bool:
bool: True if sequence ends with suffix.
"""
return self._sequence.hasSuffix(oms.AASequence.fromString(suffix))


# ===================== Exporting =======================
def to_string(self, modified=True, mod_format: Literal['default', 'unimod', 'bracket'] = 'default') -> str:
    """
    Export the sequence as a string.

    Args:
        modified (bool): Whether to include modifications in the string.
            When False, ``unmodified_sequence`` is returned and
            ``mod_format`` is not consulted.
        mod_format (Literal['default', 'unimod', 'bracket']): Format for modifications.
            'default' for OpenMS format,
            'unimod' for UniMod format,
            'bracket' for bracket notation.
            Default is 'default'.

    Returns:
        str: Amino acid sequence as string.

    Raises:
        ValueError: If ``modified`` is true and ``mod_format`` is not one of
            the supported values.

    Example:
        >>> seq = Py_AASequence.from_string("PEPTIDE")
        >>> seq_str = seq.to_string()
    """
    if not modified:
        return self.unmodified_sequence

    # Each supported format name paired with the native exporter method it uses.
    exporters = (
        ('default', 'toString'),
        ('unimod', 'toUniModString'),
        ('bracket', 'toBracketString'),
    )
    for format_name, method_name in exporters:
        if mod_format == format_name:
            return getattr(self._sequence, method_name)()

    raise ValueError(f"Unsupported mod_format: {mod_format}, supported are 'default', 'unimod' and 'bracket'")
45 changes: 38 additions & 7 deletions tests/test_py_aasequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,25 +118,22 @@ def test_py_aasequence_iteration():
seq = Py_AASequence.from_string("PEPTIDE")
residues = list(seq)

assert residues == ["P", "E", "P", "T", "I", "D", "E"]
assert [res.sequence for res in residues] == ["P", "E", "P", "T", "I", "D", "E"]
assert len(residues) == 7


def test_py_aasequence_indexing():
    """Test indexing into sequence."""
    seq = Py_AASequence.from_string("PEPTIDE")

    # Indexing returns a Py_AASequence, so compare through its string form.
    assert seq[0].sequence == "P"
    assert seq[1].sequence == "E"
    assert seq[6].sequence == "E"

    # Negative indices count from the end rather than raising.
    assert seq[-1].sequence == "E"

    # Test out of bounds
    with pytest.raises(IndexError):
        _ = seq[7]


def test_py_aasequence_string_representation():
"""Test string representations."""
Expand Down Expand Up @@ -258,3 +255,37 @@ def test_py_aasequence_with_native_aasequence():

assert seq.sequence == "PEPTIDE"
assert seq.native is native


def test_py_aasequence_to_string():
    """Exercise to_string with the modified flag and every mod_format value."""
    peptide = Py_AASequence.from_string("PEPTIDEM(Oxidation)")

    # No arguments: modified string in the default format.
    assert peptide.to_string() == "PEPTIDEM(Oxidation)"

    # modified=False drops the modification.
    assert peptide.to_string(modified=False) == "PEPTIDEM"

    # Bracket notation.
    assert peptide.to_string(modified=True, mod_format='bracket') == "PEPTIDEM[147]"

    # UniMod notation.
    assert peptide.to_string(modified=True, mod_format='unimod') == "PEPTIDEM(UniMod:35)"

    # An unknown format name is rejected.
    with pytest.raises(ValueError):
        peptide.to_string(modified=True, mod_format='invalid_format')


def test_slicing():
    """Check integer indexing and slicing on a sequence carrying a modification."""
    peptide = Py_AASequence.from_string('PEPTIDEM(Oxidation)R')

    first = peptide[0]
    last = peptide[-1]
    middle = peptide[1:4]
    tail = peptide[-2:]

    assert first.sequence == 'P'
    assert last.sequence == 'R'
    assert middle.sequence == 'EPT'
    assert tail.sequence == 'M(Oxidation)R'