Skip to content

Sequence Module

Core sequence handling and manipulation.

Overview

The sequence module provides the Sequence class for representing individual DNA/protein sequences.

Classes

pypopart.core.sequence

DNA sequence representation and manipulation for PyPopART.

Sequence

Represents a DNA sequence with metadata.

Supports IUPAC nucleotide codes including ambiguous characters.

Source code in src/pypopart/core/sequence.py
class Sequence:
    """
    Represents a DNA sequence with metadata.

    Supports IUPAC nucleotide codes including ambiguous characters.
    """

    # IUPAC nucleotide codes (uppercase)
    VALID_CHARS = set('ACGTRYSWKMBDHVN-?')

    # Reverse complement mapping
    COMPLEMENT = {
        'A': 'T',
        'T': 'A',
        'C': 'G',
        'G': 'C',
        'R': 'Y',
        'Y': 'R',
        'S': 'S',
        'W': 'W',
        'K': 'M',
        'M': 'K',
        'B': 'V',
        'V': 'B',
        'D': 'H',
        'H': 'D',
        'N': 'N',
        '-': '-',
        '?': '?',
    }

    def __init__(
        self,
        id: str,
        data: str,
        metadata: Optional[Dict[str, Any]] = None,
        description: Optional[str] = None,
    ):
        """
        Initialize a sequence.

        Parameters
        ----------
        id :
            Sequence identifier.
        data :
            DNA sequence string.
        metadata :
            Optional metadata dictionary.
        description :
            Optional sequence description.

        Raises :
        ValueError :
            If sequence contains invalid characters.
        """
        self.id = id
        self.data = data.strip().upper()
        self.metadata = metadata if metadata is not None else {}
        self.description = description

        # Validate sequence
        self._validate()

    def _validate(self) -> None:
        """
        Validate sequence data contains only valid IUPAC characters.

        Raises
        ------
            ValueError: If invalid characters found
        """
        invalid = set(self.data) - self.VALID_CHARS
        if invalid:
            raise ValueError(f'Invalid characters in sequence: {sorted(invalid)}')

    def __len__(self) -> int:
        """Return sequence length."""
        return len(self.data)

    def __eq__(self, other: object) -> bool:
        """Check equality based on sequence data."""
        if not isinstance(other, Sequence):
            return False
        return self.data == other.data

    def __hash__(self) -> int:
        """Hash based on sequence data for use in sets/dicts."""
        return hash(self.data)

    def __str__(self) -> str:
        """Return FASTA format string representation."""
        header = f'>{self.id}'
        if self.description:
            header += f' {self.description}'
        return f'{header}\n{self.data}'

    def __repr__(self) -> str:
        """Return detailed representation."""
        return (
            f"Sequence(id='{self.id}', length={len(self)}, data='{self.data[:20]}...')"
        )

    def reverse_complement(self) -> 'Sequence':
        """
        Calculate reverse complement of sequence.

        Returns
        -------
            New Sequence object with reverse complement.
        """
        rev_comp_data = ''.join(self.COMPLEMENT[base] for base in reversed(self.data))
        return Sequence(
            id=f'{self.id}_rev_comp',
            data=rev_comp_data,
            metadata=self.metadata.copy(),
            description=f'Reverse complement of {self.id}',
        )

    def gc_content(self) -> float:
        """
        Calculate GC content as fraction (0.0-1.0).

        Ignores gaps (-) and ambiguous characters (N, ?).

        Returns
        -------
            GC content fraction.
        """
        # Count G, C, and S (which represents G or C)
        gc_count = sum(1 for base in self.data if base in 'GCS')
        # Count only ACGT bases (exclude gaps, ambiguous chars)
        total_count = sum(1 for base in self.data if base in 'ACGT')

        if total_count == 0:
            return 0.0
        return gc_count / total_count

    def count_gaps(self) -> int:
        """
        Count number of gap characters.

        Returns
        -------
            Number of gaps.
        """
        return self.data.count('-')

    def count_ambiguous(self) -> int:
        """
        Count number of ambiguous characters (N and ?).

        Returns
        -------
            Number of ambiguous characters.
        """
        return sum(1 for base in self.data if base in 'N?')

    def remove_gaps(self) -> 'Sequence':
        """
        Create new sequence with gaps removed.

        Returns
        -------
            New Sequence object without gaps.
        """
        ungapped_data = self.data.replace('-', '')
        return Sequence(
            id=self.id,
            data=ungapped_data,
            metadata=self.metadata.copy(),
            description=self.description,
        )

    def slice(self, start: int, end: Optional[int] = None) -> 'Sequence':
        """
            Extract a slice of the sequence.

        Parameters
        ----------
            start :
                Start position (0-based, inclusive).
            end :
                End position (0-based, exclusive), None for end of sequence.

        Returns
        -------
            New Sequence object with sliced data.
        """
        sliced_data = self.data[start:end]
        slice_desc = f'slice[{start}:{end if end else "end"}]'
        return Sequence(
            id=f'{self.id}_{slice_desc}',
            data=sliced_data,
            metadata=self.metadata.copy(),
            description=f'Slice {slice_desc} of {self.id}',
        )

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert sequence to dictionary representation.

        Returns
        -------
            Dictionary with sequence data and statistics.
        """
        return {
            'id': self.id,
            'data': self.data,
            'metadata': self.metadata,
            'description': self.description,
            'length': len(self),
            'gc_content': self.gc_content(),
            'gaps': self.count_gaps(),
            'ambiguous': self.count_ambiguous(),
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'Sequence':
        """
            Create sequence from dictionary.

        Parameters
        ----------
            data :
                Dictionary with sequence information.

        Returns
        -------
            New Sequence object.
        """
        return cls(
            id=data['id'],
            data=data['data'],
            metadata=data.get('metadata', {}),
            description=data.get('description'),
        )
__init__
__init__(
    id: str,
    data: str,
    metadata: Optional[Dict[str, Any]] = None,
    description: Optional[str] = None,
)

Initialize a sequence.

Parameters:

Name Type Description Default
id str

Sequence identifier.

required
data str

DNA sequence string.

required
metadata Optional[Dict[str, Any]]

Optional metadata dictionary.

None
description Optional[str]

Optional sequence description.

None
Raises
required
ValueError

If sequence contains invalid characters.

required
Source code in src/pypopart/core/sequence.py
def __init__(
    self,
    id: str,
    data: str,
    metadata: Optional[Dict[str, Any]] = None,
    description: Optional[str] = None,
):
    """
    Initialize a sequence.

    Parameters
    ----------
    id :
        Sequence identifier.
    data :
        DNA sequence string.
    metadata :
        Optional metadata dictionary.
    description :
        Optional sequence description.

    Raises :
    ValueError :
        If sequence contains invalid characters.
    """
    self.id = id
    self.data = data.strip().upper()
    self.metadata = metadata if metadata is not None else {}
    self.description = description

    # Validate sequence
    self._validate()
__len__
__len__() -> int

Return sequence length.

Source code in src/pypopart/core/sequence.py
def __len__(self) -> int:
    """Return sequence length."""
    return len(self.data)
__eq__
__eq__(other: object) -> bool

Check equality based on sequence data.

Source code in src/pypopart/core/sequence.py
def __eq__(self, other: object) -> bool:
    """Check equality based on sequence data."""
    if not isinstance(other, Sequence):
        return False
    return self.data == other.data
__hash__
__hash__() -> int

Hash based on sequence data for use in sets/dicts.

Source code in src/pypopart/core/sequence.py
def __hash__(self) -> int:
    """Hash based on sequence data for use in sets/dicts."""
    return hash(self.data)
__str__
__str__() -> str

Return FASTA format string representation.

Source code in src/pypopart/core/sequence.py
def __str__(self) -> str:
    """Return FASTA format string representation."""
    header = f'>{self.id}'
    if self.description:
        header += f' {self.description}'
    return f'{header}\n{self.data}'
__repr__
__repr__() -> str

Return detailed representation.

Source code in src/pypopart/core/sequence.py
def __repr__(self) -> str:
    """Return detailed representation."""
    return (
        f"Sequence(id='{self.id}', length={len(self)}, data='{self.data[:20]}...')"
    )
reverse_complement
reverse_complement() -> Sequence

Calculate reverse complement of sequence.

Returns:

Type Description
New Sequence object with reverse complement.
Source code in src/pypopart/core/sequence.py
def reverse_complement(self) -> 'Sequence':
    """
    Calculate reverse complement of sequence.

    Returns
    -------
        New Sequence object with reverse complement.
    """
    rev_comp_data = ''.join(self.COMPLEMENT[base] for base in reversed(self.data))
    return Sequence(
        id=f'{self.id}_rev_comp',
        data=rev_comp_data,
        metadata=self.metadata.copy(),
        description=f'Reverse complement of {self.id}',
    )
gc_content
gc_content() -> float

Calculate GC content as fraction (0.0-1.0).

Ignores gaps (-) and ambiguous characters (N, ?).

Returns:

Type Description
GC content fraction.
Source code in src/pypopart/core/sequence.py
def gc_content(self) -> float:
    """
    Calculate GC content as fraction (0.0-1.0).

    Ignores gaps (-) and ambiguous characters (N, ?).

    Returns
    -------
        GC content fraction.
    """
    # Count G, C, and S (which represents G or C)
    gc_count = sum(1 for base in self.data if base in 'GCS')
    # Count only ACGT bases (exclude gaps, ambiguous chars)
    total_count = sum(1 for base in self.data if base in 'ACGT')

    if total_count == 0:
        return 0.0
    return gc_count / total_count
count_gaps
count_gaps() -> int

Count number of gap characters.

Returns:

Type Description
Number of gaps.
Source code in src/pypopart/core/sequence.py
def count_gaps(self) -> int:
    """
    Count number of gap characters.

    Returns
    -------
        Number of gaps.
    """
    return self.data.count('-')
count_ambiguous
count_ambiguous() -> int

Count number of ambiguous characters (N and ?).

Returns:

Type Description
Number of ambiguous characters.
Source code in src/pypopart/core/sequence.py
def count_ambiguous(self) -> int:
    """
    Count number of ambiguous characters (N and ?).

    Returns
    -------
        Number of ambiguous characters.
    """
    return sum(1 for base in self.data if base in 'N?')
remove_gaps
remove_gaps() -> Sequence

Create new sequence with gaps removed.

Returns:

Type Description
New Sequence object without gaps.
Source code in src/pypopart/core/sequence.py
def remove_gaps(self) -> 'Sequence':
    """
    Create new sequence with gaps removed.

    Returns
    -------
        New Sequence object without gaps.
    """
    ungapped_data = self.data.replace('-', '')
    return Sequence(
        id=self.id,
        data=ungapped_data,
        metadata=self.metadata.copy(),
        description=self.description,
    )
slice
slice(start: int, end: Optional[int] = None) -> Sequence
Extract a slice of the sequence.

Returns:

Type Description
New Sequence object with sliced data.
Source code in src/pypopart/core/sequence.py
def slice(self, start: int, end: Optional[int] = None) -> 'Sequence':
    """
        Extract a slice of the sequence.

    Parameters
    ----------
        start :
            Start position (0-based, inclusive).
        end :
            End position (0-based, exclusive), None for end of sequence.

    Returns
    -------
        New Sequence object with sliced data.
    """
    sliced_data = self.data[start:end]
    slice_desc = f'slice[{start}:{end if end else "end"}]'
    return Sequence(
        id=f'{self.id}_{slice_desc}',
        data=sliced_data,
        metadata=self.metadata.copy(),
        description=f'Slice {slice_desc} of {self.id}',
    )
to_dict
to_dict() -> Dict[str, Any]

Convert sequence to dictionary representation.

Returns:

Type Description
Dictionary with sequence data and statistics.
Source code in src/pypopart/core/sequence.py
def to_dict(self) -> Dict[str, Any]:
    """
    Convert sequence to dictionary representation.

    Returns
    -------
        Dictionary with sequence data and statistics.
    """
    return {
        'id': self.id,
        'data': self.data,
        'metadata': self.metadata,
        'description': self.description,
        'length': len(self),
        'gc_content': self.gc_content(),
        'gaps': self.count_gaps(),
        'ambiguous': self.count_ambiguous(),
    }
from_dict classmethod
from_dict(data: Dict[str, Any]) -> Sequence
Create sequence from dictionary.

Returns:

Type Description
New Sequence object.
Source code in src/pypopart/core/sequence.py
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> 'Sequence':
    """
        Create sequence from dictionary.

    Parameters
    ----------
        data :
            Dictionary with sequence information.

    Returns
    -------
        New Sequence object.
    """
    return cls(
        id=data['id'],
        data=data['data'],
        metadata=data.get('metadata', {}),
        description=data.get('description'),
    )