Skip to content

PAF I/O

This module provides classes and helpers for reading, writing, and reordering PAF (Pairwise mApping Format) alignment records.

PafAlignment — Alignment record collection

`PafAlignment` wraps a list of `rusty_dot.paf_io.PafRecord` objects and provides filtering, contig reordering, and sequence-length lookup utilities. It can be passed directly to `rusty_dot.dotplot.DotPlotter` — no `rusty_dot.SequenceIndex` is required:

from rusty_dot.paf_io import PafAlignment
from rusty_dot.dotplot import DotPlotter

aln = PafAlignment.from_file("alignments.paf")
q_order, t_order = aln.reorder_contigs()

plotter = DotPlotter(aln)
plotter.plot(
    query_names=q_order,
    target_names=t_order,
    output_path="dotplot.png",
    scale_sequences=True,
)

PafRecord dataclass

A single PAF alignment record.

The twelve required PAF columns are represented as typed attributes. Optional SAM-like tags (e.g. `tp:A:P`, `cg:Z:10M`) are stored in the `tags` attribute. If a `cg:Z:` tag is present, CIGAR-derived alignment statistics are populated automatically.

Parameters:

Name Type Description Default
query_name str

Query sequence name (column 1).

required
query_len int

Query sequence length (column 2).

required
query_start int

Query start position, 0-based (column 3).

required
query_end int

Query end position, exclusive (column 4).

required
strand str

Relative strand: "+" or "-" (column 5).

required
target_name str

Target sequence name (column 6).

required
target_len int

Target sequence length (column 7).

required
target_start int

Target start position, 0-based (column 8).

required
target_end int

Target end position, exclusive (column 9).

required
residue_matches int

Number of residue matches (column 10).

required
alignment_block_len int

Number of bases in the alignment block (column 11).

required
mapping_quality int

Mapping quality (0–255; 255 = missing) (column 12).

required
tags dict[str, Any]

Optional SAM-like tags decoded as {tag_name: value}.

dict()
cigar str or None

CIGAR string from cg:Z: tag, or None if absent.

None
alignment_length int or None

Target-span alignment length derived from CIGAR, or None.

None
n_matches int or None

Count of exact-match bases (= ops) from CIGAR; falls back to residue_matches when only M ops are present.

None
n_mismatches int or None

Count of mismatch bases (X ops) from CIGAR, or None.

None
n_gaps int or None

Total number of gap bases (I + D bases) from CIGAR.

None
n_gap_bases int or None

Same as n_gaps (alias kept for clarity).

None
Source code in rusty_dot/paf_io.py
@dataclass
class PafRecord:
    """A single PAF alignment record.

    The twelve required PAF columns are represented as typed attributes.
    Optional SAM-like tags (e.g. ``tp:A:P``, ``cg:Z:10M``) are stored in
    :attr:`tags`.  If a ``cg:Z:`` tag is present, CIGAR-derived alignment
    statistics are populated automatically.

    Parameters
    ----------
    query_name : str
        Query sequence name (column 1).
    query_len : int
        Query sequence length (column 2).
    query_start : int
        Query start position, 0-based (column 3).
    query_end : int
        Query end position, exclusive (column 4).
    strand : str
        Relative strand: ``"+"`` or ``"-"`` (column 5).
    target_name : str
        Target sequence name (column 6).
    target_len : int
        Target sequence length (column 7).
    target_start : int
        Target start position, 0-based (column 8).
    target_end : int
        Target end position, exclusive (column 9).
    residue_matches : int
        Number of residue matches (column 10).
    alignment_block_len : int
        Number of bases in the alignment block (column 11).
    mapping_quality : int
        Mapping quality (0–255; 255 = missing) (column 12).
    tags : dict[str, Any]
        Optional SAM-like tags decoded as ``{tag_name: value}``.
    cigar : str or None
        CIGAR string from ``cg:Z:`` tag, or ``None`` if absent.
    alignment_length : int or None
        Target-span alignment length derived from CIGAR, or ``None``.
    n_matches : int or None
        Count of exact-match bases (``=`` ops) from CIGAR; falls back to
        ``residue_matches`` when only ``M`` ops are present.
    n_mismatches : int or None
        Count of mismatch bases (``X`` ops) from CIGAR, or ``None``.
    n_gaps : int or None
        Total number of gap bases (``I`` + ``D`` bases) from CIGAR.
    n_gap_bases : int or None
        Same as ``n_gaps`` (alias kept for clarity).
    """

    query_name: str
    query_len: int
    query_start: int
    query_end: int
    strand: str
    target_name: str
    target_len: int
    target_start: int
    target_end: int
    residue_matches: int
    alignment_block_len: int
    mapping_quality: int
    tags: dict[str, Any] = field(default_factory=dict)
    cigar: str | None = None
    alignment_length: int | None = None
    n_matches: int | None = None
    n_mismatches: int | None = None
    n_gaps: int | None = None
    n_gap_bases: int | None = None

    @property
    def query_aligned_len(self) -> int:
        """Return the aligned length on the query sequence.

        Returns
        -------
        int
            ``query_end - query_start``.
        """
        return self.query_end - self.query_start

    @property
    def target_aligned_len(self) -> int:
        """Return the aligned length on the target sequence.

        Returns
        -------
        int
            ``target_end - target_start``.
        """
        return self.target_end - self.target_start

    @classmethod
    def from_line(cls, line: str) -> 'PafRecord':
        """Parse a single PAF text line into a :class:`PafRecord`.

        Parameters
        ----------
        line : str
            A single PAF record line (tab-separated, trailing newline optional).

        Returns
        -------
        PafRecord
            The parsed record.

        Raises
        ------
        ValueError
            If the line has fewer than 12 tab-separated fields.
        """
        fields = line.rstrip('\n').split('\t')
        if len(fields) < 12:
            raise ValueError(
                f'PAF line has {len(fields)} fields; expected at least 12: {line!r}'
            )
        tags: dict[str, Any] = {}
        cigar: str | None = None
        for tag_field in fields[12:]:
            parts = tag_field.split(':', 2)
            if len(parts) == 3:
                tag_name, tag_type, tag_value = parts
                if tag_type == 'i':
                    tags[tag_name] = int(tag_value)
                elif tag_type == 'f':
                    tags[tag_name] = float(tag_value)
                else:
                    tags[tag_name] = tag_value
                if tag_name == 'cg' and tag_type == 'Z':
                    cigar = tag_value

        residue_matches = int(fields[9])
        stats: dict[str, int] = {}
        if cigar is not None:
            stats = _cigar_stats(cigar, residue_matches)

        return cls(
            query_name=fields[0],
            query_len=int(fields[1]),
            query_start=int(fields[2]),
            query_end=int(fields[3]),
            strand=fields[4],
            target_name=fields[5],
            target_len=int(fields[6]),
            target_start=int(fields[7]),
            target_end=int(fields[8]),
            residue_matches=residue_matches,
            alignment_block_len=int(fields[10]),
            mapping_quality=int(fields[11]),
            tags=tags,
            cigar=cigar,
            alignment_length=stats.get('alignment_length'),
            n_matches=stats.get('n_matches'),
            n_mismatches=stats.get('n_mismatches'),
            n_gaps=stats.get('n_gaps'),
            n_gap_bases=stats.get('n_gap_bases'),
        )

    def to_line(self) -> str:
        """Serialise this record back to a PAF-format string (no trailing newline).

        Returns
        -------
        str
            Tab-separated PAF line with the 12 required columns.  Optional
            tags are not included.
        """
        return '\t'.join(
            str(v)
            for v in [
                self.query_name,
                self.query_len,
                self.query_start,
                self.query_end,
                self.strand,
                self.target_name,
                self.target_len,
                self.target_start,
                self.target_end,
                self.residue_matches,
                self.alignment_block_len,
                self.mapping_quality,
            ]
        )

Attributes

query_aligned_len property

Return the aligned length on the query sequence.

Returns:

Type Description
int

query_end - query_start.

target_aligned_len property

Return the aligned length on the target sequence.

Returns:

Type Description
int

target_end - target_start.

Functions

from_line(line) classmethod

Parse a single PAF text line into a `PafRecord`.

Parameters:

Name Type Description Default
line str

A single PAF record line (tab-separated, trailing newline optional).

required

Returns:

Type Description
PafRecord

The parsed record.

Raises:

Type Description
ValueError

If the line has fewer than 12 tab-separated fields.

Source code in rusty_dot/paf_io.py
@classmethod
def from_line(cls, line: str) -> 'PafRecord':
    """Parse a single PAF text line into a :class:`PafRecord`.

    Parameters
    ----------
    line : str
        A single PAF record line (tab-separated).  A trailing line
        terminator (LF or CRLF) is optional and is stripped.

    Returns
    -------
    PafRecord
        The parsed record.

    Raises
    ------
    ValueError
        If the line has fewer than 12 tab-separated fields, or if a
        numeric column or an ``i``/``f`` tag value cannot be converted.
    """
    # Strip '\r' as well as '\n': with CRLF input a lone rstrip('\n')
    # would leave '\r' glued to the last tag value (e.g. the CIGAR).
    fields = line.rstrip('\r\n').split('\t')
    if len(fields) < 12:
        raise ValueError(
            f'PAF line has {len(fields)} fields; expected at least 12: {line!r}'
        )
    tags: dict[str, Any] = {}
    cigar: str | None = None
    for tag_field in fields[12:]:
        # Tags look like NAME:TYPE:VALUE; the value itself may contain ':'.
        parts = tag_field.split(':', 2)
        if len(parts) == 3:
            tag_name, tag_type, tag_value = parts
            if tag_type == 'i':
                tags[tag_name] = int(tag_value)
            elif tag_type == 'f':
                tags[tag_name] = float(tag_value)
            else:
                tags[tag_name] = tag_value
            if tag_name == 'cg' and tag_type == 'Z':
                cigar = tag_value
        # Fields that do not split into three parts are skipped.

    residue_matches = int(fields[9])
    # CIGAR statistics are only available when a cg:Z: tag was seen;
    # otherwise every derived attribute stays None.
    stats: dict[str, int] = {}
    if cigar is not None:
        stats = _cigar_stats(cigar, residue_matches)

    return cls(
        query_name=fields[0],
        query_len=int(fields[1]),
        query_start=int(fields[2]),
        query_end=int(fields[3]),
        strand=fields[4],
        target_name=fields[5],
        target_len=int(fields[6]),
        target_start=int(fields[7]),
        target_end=int(fields[8]),
        residue_matches=residue_matches,
        alignment_block_len=int(fields[10]),
        mapping_quality=int(fields[11]),
        tags=tags,
        cigar=cigar,
        alignment_length=stats.get('alignment_length'),
        n_matches=stats.get('n_matches'),
        n_mismatches=stats.get('n_mismatches'),
        n_gaps=stats.get('n_gaps'),
        n_gap_bases=stats.get('n_gap_bases'),
    )

to_line()

Serialise this record back to a PAF-format string (no trailing newline).

Returns:

Type Description
str

Tab-separated PAF line with the 12 required columns. Optional tags are not included.

Source code in rusty_dot/paf_io.py
def to_line(self) -> str:
    """Serialise this record back to a PAF-format string (no trailing newline).

    Returns
    -------
    str
        Tab-separated PAF line with the 12 required columns.  Optional
        tags are not included.
    """
    # The twelve mandatory columns, in PAF column order.
    columns = (
        self.query_name,
        self.query_len,
        self.query_start,
        self.query_end,
        self.strand,
        self.target_name,
        self.target_len,
        self.target_start,
        self.target_end,
        self.residue_matches,
        self.alignment_block_len,
        self.mapping_quality,
    )
    return '\t'.join(map(str, columns))

PafAlignment

A collection of PAF alignment records with contig-ordering utilities.

Can be constructed from a file path or an iterable of `PafRecord` objects. Provides the `reorder_contigs` method to sort query and target sequence names so that a subsequent dotplot shows maximum collinearity.

Parameters:

Name Type Description Default
records list of PafRecord

The alignment records.

required

Examples:

Load from a file and reorder contigs:

>>> aln = PafAlignment.from_file("alignments.paf")
>>> q_order, t_order = aln.reorder_contigs(aln.query_names, aln.target_names)
Source code in rusty_dot/paf_io.py
class PafAlignment:
    """A collection of PAF alignment records with contig-ordering utilities.

    Can be constructed from a file path or an iterable of :class:`PafRecord`
    objects.  Provides :meth:`reorder_contigs` to sort query and target
    sequence names so that a subsequent dotplot shows maximum collinearity.

    Parameters
    ----------
    records : list of PafRecord
        The alignment records.

    Examples
    --------
    Load from a file and reorder contigs:

    >>> aln = PafAlignment.from_file("alignments.paf")
    >>> q_order, t_order = aln.reorder_contigs(aln.query_names, aln.target_names)
    """

    def __init__(self, records: list[PafRecord]) -> None:
        self.records: list[PafRecord] = records
        # Custom group assignments.  None means use the default (query_names
        # → 'a', target_names → 'b') which is computed lazily from records.
        self._groups: dict[str, list[str]] | None = None

    # ------------------------------------------------------------------
    # Constructors
    # ------------------------------------------------------------------

    @classmethod
    def from_file(cls, path: str | Path) -> 'PafAlignment':
        """Load records from a PAF file.

        Parameters
        ----------
        path : str or Path
            Path to the PAF file.

        Returns
        -------
        PafAlignment
            New instance with all records loaded.
        """
        loaded = list(parse_paf_file(path))
        return cls(loaded)

    @classmethod
    def from_records(cls, records: Iterable[PafRecord]) -> 'PafAlignment':
        """Construct from an iterable of :class:`PafRecord` objects.

        Parameters
        ----------
        records : iterable of PafRecord
            Source records.

        Returns
        -------
        PafAlignment
            New instance.
        """
        # Materialise the iterable so the instance owns a concrete list.
        return cls([*records])

    # ------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------

    @property
    def query_names(self) -> list[str]:
        """Return a deduplicated list of query sequence names (insertion order).

        Returns
        -------
        list[str]
            Unique query names in the order first seen.
        """
        # dict keys deduplicate while preserving first-seen order.
        return list(dict.fromkeys(rec.query_name for rec in self.records))

    @property
    def target_names(self) -> list[str]:
        """Return a deduplicated list of target sequence names (insertion order).

        Returns
        -------
        list[str]
            Unique target names in the order first seen.
        """
        # dict keys deduplicate while preserving first-seen order.
        return list(dict.fromkeys(rec.target_name for rec in self.records))

    def sequence_names(self) -> list[str]:
        """Return a deduplicated list of all query and target sequence names.

        The list contains each name at most once, in the order it was first
        encountered (queries before targets within each record).  This method
        makes :class:`PafAlignment` compatible with :class:`~rusty_dot.dotplot.DotPlotter`.

        Returns
        -------
        list[str]
            All unique sequence names across query and target fields.
        """
        # dict keys deduplicate while preserving first-seen order.
        ordered = dict.fromkeys(
            seq_name
            for rec in self.records
            for seq_name in (rec.query_name, rec.target_name)
        )
        return list(ordered)

    def get_sequence_length(self, name: str) -> int:
        """Return the length of a sequence by name as stored in PAF records.

        Looks up *name* in the ``query_name`` and ``target_name`` fields of
        every record and returns the corresponding ``query_len`` or
        ``target_len``.  This method makes :class:`PafAlignment` compatible
        with :class:`~rusty_dot.dotplot.DotPlotter`.

        Parameters
        ----------
        name : str
            Sequence name to look up.

        Returns
        -------
        int
            Length of the sequence.

        Raises
        ------
        KeyError
            If *name* is not found in any record.
        """
        # Lazily scan records in order; within a record the query side wins.
        candidates = (
            rec.query_len if rec.query_name == name else rec.target_len
            for rec in self.records
            if name in (rec.query_name, rec.target_name)
        )
        length = next(candidates, None)
        if length is None:
            raise KeyError(f'Sequence {name!r} not found in PAF records.')
        return length

    def __len__(self) -> int:
        """Return the number of records.

        Returns
        -------
        int
            Record count.
        """
        record_count = len(self.records)
        return record_count

    def __repr__(self) -> str:
        """Return a concise string representation.

        Returns
        -------
        str
            ``PafAlignment(records=<n>, queries=<q>, targets=<t>)``.
        """
        n_records = len(self.records)
        n_queries = len(self.query_names)
        n_targets = len(self.target_names)
        return (
            f'PafAlignment(records={n_records}, '
            f'queries={n_queries}, targets={n_targets})'
        )

    # ------------------------------------------------------------------
    # Filtering
    # ------------------------------------------------------------------

    def filter_by_query(self, names: Iterable[str]) -> 'PafAlignment':
        """Return a new :class:`PafAlignment` containing only the given query names.

        Parameters
        ----------
        names : iterable of str
            Query names to keep.

        Returns
        -------
        PafAlignment
            Filtered alignment.
        """
        # Build the lookup set once so each membership test is O(1).
        wanted = set(names)
        kept = [rec for rec in self.records if rec.query_name in wanted]
        return PafAlignment(kept)

    def filter_by_target(self, names: Iterable[str]) -> 'PafAlignment':
        """Return a new :class:`PafAlignment` containing only the given target names.

        Parameters
        ----------
        names : iterable of str
            Target names to keep.

        Returns
        -------
        PafAlignment
            Filtered alignment.
        """
        # Build the lookup set once so each membership test is O(1).
        wanted = set(names)
        kept = [rec for rec in self.records if rec.target_name in wanted]
        return PafAlignment(kept)

    def filter_by_min_length(self, min_length: int) -> 'PafAlignment':
        """Return a new :class:`PafAlignment` keeping only records of sufficient length.

        Filters on the query aligned length (``query_end - query_start``), which
        equals the alignment block span for both merged k-mer runs and PAF
        alignments imported from a file.

        Parameters
        ----------
        min_length : int
            Minimum alignment length (inclusive).  Records with a query aligned
            length strictly less than ``min_length`` are discarded.

        Returns
        -------
        PafAlignment
            Filtered alignment containing only records with
            ``query_aligned_len >= min_length``.
        """
        long_enough = [
            rec for rec in self.records if not rec.query_aligned_len < min_length
        ]
        return PafAlignment(long_enough)

    # ------------------------------------------------------------------
    # Group management
    # ------------------------------------------------------------------

    @property
    def groups(self) -> dict[str, list[str]]:
        """Return the current group assignments.

        If groups have not been set explicitly via :meth:`set_groups` or
        :meth:`rename_group`, returns the default: all query sequence names
        in group ``'a'`` and all target sequence names in group ``'b'``.

        The returned mapping and its lists are fresh copies, so mutating
        them does not change the stored assignments.

        Returns
        -------
        dict[str, list[str]]
            Mapping of group label → list of sequence names.
        """
        if self._groups is None:
            return {'a': self.query_names, 'b': self.target_names}
        # Copy the member lists too: a shallow dict() copy would hand out
        # the internal lists and let callers mutate stored state.
        return {label: list(members) for label, members in self._groups.items()}

    def set_groups(self, groups: dict[str, list[str]]) -> None:
        """Set custom group assignments for sequence names.

        Parameters
        ----------
        groups : dict[str, list[str]]
            Mapping of group label → list of sequence names belonging to
            that group.

        Warns
        -----
        Logs a warning for every sequence name that appears in more than one
        group.
        """
        # Remember the first group each name was seen in so that repeat
        # assignments can be reported against it.
        first_group: dict[str, str] = {}
        for label, members in groups.items():
            for seq_name in members:
                if seq_name not in first_group:
                    first_group[seq_name] = label
                    continue
                _log.warning(
                    'PafAlignment.set_groups: sequence %r is assigned to '
                    'both group %r and group %r',
                    seq_name,
                    first_group[seq_name],
                    label,
                )
        # Store copies so later mutation of the caller's lists has no effect.
        self._groups = {label: list(members) for label, members in groups.items()}

    def rename_group(self, old_name: str, new_name: str) -> None:
        """Rename a group label.

        If custom groups have not been set yet, the default assignment
        (``'a'`` → query names, ``'b'`` → target names) is materialised
        first.

        Parameters
        ----------
        old_name : str
            Current group label.
        new_name : str
            New group label.

        Raises
        ------
        KeyError
            If *old_name* is not a known group.
        ValueError
            If *new_name* already exists as a different group label.
        """
        # ``groups`` yields either the stored assignments or the defaults.
        current = self.groups
        if old_name not in current:
            raise KeyError(f'Group {old_name!r} not found.')
        if new_name in current and new_name != old_name:
            raise ValueError(f'Group {new_name!r} already exists.')
        # Rebuild the mapping so the renamed label keeps its position.
        self._groups = {
            (new_name if label == old_name else label): members
            for label, members in current.items()
        }
        _log.info('PafAlignment: renamed group %r -> %r', old_name, new_name)

    # ------------------------------------------------------------------
    # Contig reordering
    # ------------------------------------------------------------------

    def reorder_contigs(
        self,
        query_names: list[str] | None = None,
        target_names: list[str] | None = None,
        query_group: str | None = None,
        target_group: str | None = None,
    ) -> tuple[list[str], list[str]]:
        """Sort query and target contigs to maximise collinearity in the dotplot.

        Uses the gravity-centre algorithm: each contig is assigned a gravity
        equal to the weighted mean position of its alignment blocks on the
        opposing axis.  Contigs are then sorted by ascending gravity.

        Parameters
        ----------
        query_names : list[str] or None, optional
            Query contigs to reorder.  Ignored when *query_group* is given.
            Defaults to :attr:`query_names`.
        target_names : list[str] or None, optional
            Target contigs to reorder.  Ignored when *target_group* is given.
            Defaults to :attr:`target_names`.
        query_group : str or None, optional
            Group label whose members are used as query contigs.  When
            provided, the corresponding entry in :attr:`groups` is used and
            *query_names* is ignored.
        target_group : str or None, optional
            Group label whose members are used as target contigs.  When
            provided, the corresponding entry in :attr:`groups` is used and
            *target_names* is ignored.

        Returns
        -------
        tuple[list[str], list[str]]
            ``(sorted_query_names, sorted_target_names)``.

        Raises
        ------
        KeyError
            If a supplied group label is not present in :attr:`groups`.
        """
        current_groups = self.groups

        # A group label takes precedence over an explicit name list, which
        # in turn takes precedence over the names found in the records.
        if query_group is not None:
            if query_group not in current_groups:
                raise KeyError(f'Group {query_group!r} not found.')
            q = current_groups[query_group]
        elif query_names is not None:
            q = query_names
        else:
            q = self.query_names

        if target_group is not None:
            if target_group not in current_groups:
                raise KeyError(f'Group {target_group!r} not found.')
            t = current_groups[target_group]
        elif target_names is not None:
            t = target_names
        else:
            t = self.target_names

        # The ordering itself is delegated to the module-level helper.
        return compute_gravity_contigs(self.records, q, t)

Attributes

query_names property

Return a deduplicated list of query sequence names (insertion order).

Returns:

Type Description
list[str]

Unique query names in the order first seen.

target_names property

Return a deduplicated list of target sequence names (insertion order).

Returns:

Type Description
list[str]

Unique target names in the order first seen.

groups property

Return the current group assignments.

If groups have not been set explicitly via `set_groups` or `rename_group`, returns the default: all query sequence names in group 'a' and all target sequence names in group 'b'.

Returns:

Type Description
dict[str, list[str]]

Mapping of group label → list of sequence names.

Functions

from_file(path) classmethod

Load records from a PAF file.

Parameters:

Name Type Description Default
path str or Path

Path to the PAF file.

required

Returns:

Type Description
PafAlignment

New instance with all records loaded.

Source code in rusty_dot/paf_io.py
@classmethod
def from_file(cls, path: str | Path) -> 'PafAlignment':
    """Load records from a PAF file.

    Parameters
    ----------
    path : str or Path
        Path to the PAF file.

    Returns
    -------
    PafAlignment
        New instance with all records loaded.
    """
    loaded = list(parse_paf_file(path))
    return cls(loaded)

from_records(records) classmethod

Construct from an iterable of `PafRecord` objects.

Parameters:

Name Type Description Default
records iterable of PafRecord

Source records.

required

Returns:

Type Description
PafAlignment

New instance.

Source code in rusty_dot/paf_io.py
@classmethod
def from_records(cls, records: Iterable[PafRecord]) -> 'PafAlignment':
    """Construct from an iterable of :class:`PafRecord` objects.

    Parameters
    ----------
    records : iterable of PafRecord
        Source records.

    Returns
    -------
    PafAlignment
        New instance.
    """
    # Materialise the iterable so the instance owns a concrete list.
    return cls([*records])

sequence_names()

Return a deduplicated list of all query and target sequence names.

The list contains each name at most once, in the order it was first encountered (queries before targets within each record). This method makes `PafAlignment` compatible with `rusty_dot.dotplot.DotPlotter`.

Returns:

Type Description
list[str]

All unique sequence names across query and target fields.

Source code in rusty_dot/paf_io.py
def sequence_names(self) -> list[str]:
    """Return a deduplicated list of all query and target sequence names.

    The list contains each name at most once, in the order it was first
    encountered (queries before targets within each record).  This method
    makes :class:`PafAlignment` compatible with :class:`~rusty_dot.dotplot.DotPlotter`.

    Returns
    -------
    list[str]
        All unique sequence names across query and target fields.
    """
    # dict keys deduplicate while preserving first-seen order.
    ordered = dict.fromkeys(
        seq_name
        for rec in self.records
        for seq_name in (rec.query_name, rec.target_name)
    )
    return list(ordered)

get_sequence_length(name)

Return the length of a sequence by name as stored in PAF records.

Looks up `name` in the `query_name` and `target_name` fields of every record and returns the corresponding `query_len` or `target_len`. This method makes `PafAlignment` compatible with `rusty_dot.dotplot.DotPlotter`.

Parameters:

Name Type Description Default
name str

Sequence name to look up.

required

Returns:

Type Description
int

Length of the sequence.

Raises:

Type Description
KeyError

If name is not found in any record.

Source code in rusty_dot/paf_io.py
def get_sequence_length(self, name: str) -> int:
    """Return the length of a sequence by name as stored in PAF records.

    Looks up *name* in the ``query_name`` and ``target_name`` fields of
    every record and returns the corresponding ``query_len`` or
    ``target_len``.  This method makes :class:`PafAlignment` compatible
    with :class:`~rusty_dot.dotplot.DotPlotter`.

    Parameters
    ----------
    name : str
        Sequence name to look up.

    Returns
    -------
    int
        Length of the sequence.

    Raises
    ------
    KeyError
        If *name* is not found in any record.
    """
    # Lazily scan records in order; within a record the query side wins.
    candidates = (
        rec.query_len if rec.query_name == name else rec.target_len
        for rec in self.records
        if name in (rec.query_name, rec.target_name)
    )
    length = next(candidates, None)
    if length is None:
        raise KeyError(f'Sequence {name!r} not found in PAF records.')
    return length

__len__()

Return the number of records.

Returns:

Type Description
int

Record count.

Source code in rusty_dot/paf_io.py
def __len__(self) -> int:
    """Return the number of records.

    Returns
    -------
    int
        Record count.
    """
    record_count = len(self.records)
    return record_count

__repr__()

Return a concise string representation.

Returns:

Type Description
str

PafAlignment(records=<n>, queries=<q>, targets=<t>).

Source code in rusty_dot/paf_io.py
def __repr__(self) -> str:
    """Return a concise string representation.

    Returns
    -------
    str
        ``PafAlignment(records=<n>, queries=<q>, targets=<t>)``.
    """
    n_records = len(self.records)
    n_queries = len(self.query_names)
    n_targets = len(self.target_names)
    return (
        f'PafAlignment(records={n_records}, '
        f'queries={n_queries}, targets={n_targets})'
    )

filter_by_query(names)

Return a new `PafAlignment` containing only the given query names.

Parameters:

Name Type Description Default
names iterable of str

Query names to keep.

required

Returns:

Type Description
PafAlignment

Filtered alignment.

Source code in rusty_dot/paf_io.py
def filter_by_query(self, names: Iterable[str]) -> 'PafAlignment':
    """Return a new :class:`PafAlignment` containing only the given query names.

    Parameters
    ----------
    names : iterable of str
        Query names to keep.

    Returns
    -------
    PafAlignment
        Filtered alignment.
    """
    # Build the lookup set once so each membership test is O(1).
    wanted = set(names)
    kept = [rec for rec in self.records if rec.query_name in wanted]
    return PafAlignment(kept)

filter_by_target(names)

Return a new `PafAlignment` containing only the given target names.

Parameters:

Name Type Description Default
names iterable of str

Target names to keep.

required

Returns:

Type Description
PafAlignment

Filtered alignment.

Source code in rusty_dot/paf_io.py
def filter_by_target(self, names: Iterable[str]) -> 'PafAlignment':
    """Return a new :class:`PafAlignment` containing only the given target names.

    Parameters
    ----------
    names : iterable of str
        Target names to keep.

    Returns
    -------
    PafAlignment
        Filtered alignment.
    """
    # Build the lookup set once so each membership test is O(1).
    wanted = set(names)
    kept = [rec for rec in self.records if rec.target_name in wanted]
    return PafAlignment(kept)

filter_by_min_length(min_length)

Return a new `PafAlignment` keeping only records of sufficient length.

Filters on the query aligned length (query_end - query_start), which equals the alignment block span for both merged k-mer runs and PAF alignments imported from a file.

Parameters:

Name Type Description Default
min_length int

Minimum alignment length (inclusive). Records with a query aligned length strictly less than min_length are discarded.

required

Returns:

Type Description
PafAlignment

Filtered alignment containing only records with query_aligned_len >= min_length.

Source code in rusty_dot/paf_io.py
def filter_by_min_length(self, min_length: int) -> 'PafAlignment':
    """Build a new :class:`PafAlignment` without the short records.

    The comparison is made against each record's ``query_aligned_len``
    attribute (the aligned span on the query sequence).

    Parameters
    ----------
    min_length : int
        Inclusive lower bound.  A record is kept when its
        ``query_aligned_len`` is greater than or equal to this value and
        dropped otherwise.

    Returns
    -------
    PafAlignment
        A new alignment containing only the records that satisfy
        ``query_aligned_len >= min_length``, in their original order.
    """
    long_enough = [
        record
        for record in self.records
        if record.query_aligned_len >= min_length
    ]
    return PafAlignment(long_enough)

set_groups(groups)

Set custom group assignments for sequence names.

Parameters:

Name Type Description Default
groups dict[str, list[str]]

Mapping of group label → list of sequence names belonging to that group.

required

Warns:

Type Description
Logs a warning for every sequence name that appears in more than one
group.
Source code in rusty_dot/paf_io.py
def set_groups(self, groups: dict[str, list[str]]) -> None:
    """Replace the stored group assignments with *groups*.

    Parameters
    ----------
    groups : dict[str, list[str]]
        Mapping of group label → sequence names that belong to that group.

    Warns
    -----
    A warning is logged each time a sequence name is encountered again after
    it has already been seen; the message names the group that first claimed
    the sequence and the group in which it re-appeared.

    Notes
    -----
    Duplicates are reported but not removed: every group is stored with all
    of its listed names.  The name lists are shallow-copied, so later
    mutation of the caller's lists does not affect the stored groups.
    """
    # Maps each sequence name to the first group label that listed it.
    first_owner: dict[str, str] = {}
    for label, members in groups.items():
        for member in members:
            if member not in first_owner:
                first_owner[member] = label
                continue
            _log.warning(
                'PafAlignment.set_groups: sequence %r is assigned to '
                'both group %r and group %r',
                member,
                first_owner[member],
                label,
            )

    # Store copies so the caller's lists are not aliased.
    copied: dict[str, list[str]] = {}
    for label, members in groups.items():
        copied[label] = list(members)
    self._groups = copied

rename_group(old_name, new_name)

Rename a group label.

If custom groups have not been set yet, the default assignment ('a' → query names, 'b' → target names) is materialised first.

Parameters:

Name Type Description Default
old_name str

Current group label.

required
new_name str

New group label.

required

Raises:

Type Description
KeyError

If old_name is not a known group.

ValueError

If new_name already exists as a different group label.

Source code in rusty_dot/paf_io.py
def rename_group(self, old_name: str, new_name: str) -> None:
    """Rename a group label.

    If custom groups have not been set yet (``self._groups`` is ``None``),
    the mapping exposed by :attr:`groups` is used as the starting point and
    stored as the custom assignment after the rename.

    Parameters
    ----------
    old_name : str
        Current group label.
    new_name : str
        New group label.

    Raises
    ------
    KeyError
        If *old_name* is not a known group.
    ValueError
        If *new_name* already exists as a different group label.

    Notes
    -----
    The renamed group keeps its position in the mapping and its member list
    is carried over unchanged.  Renaming a group to its own name is allowed.
    """
    # Start from the explicit assignment when present, otherwise from the
    # view returned by the ``groups`` attribute.
    current = self._groups if self._groups is not None else self.groups
    if old_name not in current:
        raise KeyError(f'Group {old_name!r} not found.')
    if new_name in current and new_name != old_name:
        raise ValueError(f'Group {new_name!r} already exists.')
    # Rebuild the dict instead of pop/insert so that key order is preserved.
    self._groups = {
        (new_name if k == old_name else k): v for k, v in current.items()
    }
    # Separate the two labels explicitly; without a separator the old and
    # new names run together in the log output.
    _log.info('PafAlignment: renamed group %r -> %r', old_name, new_name)

reorder_contigs(query_names=None, target_names=None, query_group=None, target_group=None)

Sort query and target contigs to maximise collinearity in the dotplot.

Uses the gravity-centre algorithm: each contig is assigned a gravity equal to the weighted mean position of its alignment blocks on the opposing axis. Contigs are then sorted by ascending gravity.

Parameters:

Name Type Description Default
query_names list[str] or None

Query contigs to reorder. Ignored when query_group is given. Defaults to :attr:query_names.

None
target_names list[str] or None

Target contigs to reorder. Ignored when target_group is given. Defaults to :attr:target_names.

None
query_group str or None

Group label whose members are used as query contigs. When provided, the corresponding entry in :attr:groups is used and query_names is ignored.

None
target_group str or None

Group label whose members are used as target contigs. When provided, the corresponding entry in :attr:groups is used and target_names is ignored.

None

Returns:

Type Description
tuple[list[str], list[str]]

(sorted_query_names, sorted_target_names).

Raises:

Type Description
KeyError

If a supplied group label is not present in :attr:groups.

Source code in rusty_dot/paf_io.py
def reorder_contigs(
    self,
    query_names: list[str] | None = None,
    target_names: list[str] | None = None,
    query_group: str | None = None,
    target_group: str | None = None,
) -> tuple[list[str], list[str]]:
    """Sort query and target contigs to maximise collinearity in the dotplot.

    Uses the gravity-centre algorithm: each contig is assigned a gravity
    equal to the weighted mean position of its alignment blocks on the
    opposing axis.  Contigs are then sorted by ascending gravity.

    Parameters
    ----------
    query_names : list[str] or None, optional
        Query contigs to reorder.  Ignored when *query_group* is given.
        Defaults to :attr:`query_names`.
    target_names : list[str] or None, optional
        Target contigs to reorder.  Ignored when *target_group* is given.
        Defaults to :attr:`target_names`.
    query_group : str or None, optional
        Group label whose members are used as query contigs.  When
        provided, the corresponding entry in :attr:`groups` is used and
        *query_names* is ignored.
    target_group : str or None, optional
        Group label whose members are used as target contigs.  When
        provided, the corresponding entry in :attr:`groups` is used and
        *target_names* is ignored.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(sorted_query_names, sorted_target_names)``.

    Raises
    ------
    KeyError
        If a supplied group label is not present in :attr:`groups`.
    """
    current_groups = self.groups
    if query_group is not None:
        if query_group not in current_groups:
            raise KeyError(f'Group {query_group!r} not found.')
        q = current_groups[query_group]
    else:
        q = query_names if query_names is not None else self.query_names

    if target_group is not None:
        if target_group not in current_groups:
            raise KeyError(f'Group {target_group!r} not found.')
        t = current_groups[target_group]
    else:
        t = target_names if target_names is not None else self.target_names

    return compute_gravity_contigs(self.records, q, t)

Functions

parse_paf_file(path)

Yield :class:PafRecord objects from a PAF file.

Lines beginning with # are treated as comments and skipped. Empty lines are also skipped.

Parameters:

Name Type Description Default
path str or Path

Path to the PAF file.

required

Yields:

Type Description
PafRecord

One record per non-comment, non-empty line.

Raises:

Type Description
FileNotFoundError

If path does not exist.

ValueError

If a line cannot be parsed as a PAF record.

Source code in rusty_dot/paf_io.py
def parse_paf_file(path: str | Path) -> Generator[PafRecord, None, None]:
    """Lazily parse a PAF file into :class:`PafRecord` objects.

    The file is read as UTF-8 text, one line at a time.  Empty lines and
    lines whose first character is ``#`` are skipped; every other line is
    passed to :meth:`PafRecord.from_line`.

    Parameters
    ----------
    path : str or Path
        Location of the PAF file.

    Yields
    ------
    PafRecord
        One record for each line that is neither empty nor a comment.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.  Because this is a generator, the file
        is only opened once iteration starts.
    ValueError
        If a line cannot be parsed as a PAF record.
    """
    source = Path(path)
    with source.open('r', encoding='utf-8') as handle:
        for raw in handle:
            # Drop only the newline so that tab-separated fields stay intact.
            text = raw.rstrip('\n')
            if text and not text.startswith('#'):
                yield PafRecord.from_line(text)

compute_gravity_contigs(records, query_names, target_names)

Return query and target contig names sorted by gravity centre.

For each query contig the gravity centre is the weighted mean of target mid-point positions (normalised by the total target span) across all alignment records that involve that contig. Target contigs are sorted symmetrically against the query axis.

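Contigs with no alignment records receive a gravity of float("inf") and are placed at the end of the sorted list, ordered among themselves by descending sequence length (contigs whose length is unknown are treated as length 0).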

Parameters:

Name Type Description Default
records iterable of PafRecord

Alignment records to use for computing gravity centres.

required
query_names list[str]

The query contig names to reorder.

required
target_names list[str]

The target contig names to reorder.

required

Returns:

Type Description
tuple[list[str], list[str]]

(sorted_query_names, sorted_target_names) ordered by ascending gravity centre.

Source code in rusty_dot/paf_io.py
def compute_gravity_contigs(
    records: Iterable[PafRecord],
    query_names: list[str],
    target_names: list[str],
) -> tuple[list[str], list[str]]:
    """Sort query and target contig names by their gravity centre.

    Both axes are laid out end to end in the order given, using the sequence
    lengths found in *records* (a contig whose length is unknown occupies one
    unit).  A query contig's gravity is the weighted mean of the target-axis
    mid-points of its alignments, divided by the total target-axis length;
    target contigs are treated symmetrically against the query axis.  The
    weight of a record is its ``alignment_block_len`` or, when that is zero
    or missing, ``query_end - query_start``.

    Only records whose query is in *query_names* and whose target is in
    *target_names* contribute.  Contigs left without any contributing record
    are placed after all matched contigs, ordered by descending sequence
    length (unknown length counts as 0).

    Parameters
    ----------
    records : iterable of PafRecord
        Alignment records to use.  The iterable is consumed once.
    query_names : list[str]
        The query contig names to reorder.
    target_names : list[str]
        The target contig names to reorder.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(sorted_query_names, sorted_target_names)`` in ascending gravity
        order.  Ties keep their input order because the sort is stable.
    """
    # Materialise first: the records are traversed more than once below and
    # the caller may have passed a one-shot iterator.
    materialised = list(records)

    # Later records overwrite earlier ones, so the last length seen wins.
    query_lengths: dict[str, int] = {
        rec.query_name: rec.query_len for rec in materialised
    }
    target_lengths: dict[str, int] = {
        rec.target_name: rec.target_len for rec in materialised
    }

    def _layout(names: list[str], lengths: dict) -> tuple:
        # Cumulative start offset of each contig plus the total axis length
        # (clamped to at least 1 so it is always a safe divisor).
        starts: dict[str, int] = {}
        cursor = 0
        for name in names:
            starts[name] = cursor
            cursor += lengths.get(name, 1)
        return starts, max(cursor, 1)

    target_starts, target_axis_len = _layout(target_names, target_lengths)
    query_starts, query_axis_len = _layout(query_names, query_lengths)

    wanted_queries = set(query_names)
    wanted_targets = set(target_names)

    # Per-contig accumulators: total weight and weight * opposing mid-point.
    query_mass: dict[str, float] = {name: 0.0 for name in query_names}
    query_moment: dict[str, float] = {name: 0.0 for name in query_names}
    target_mass: dict[str, float] = {name: 0.0 for name in target_names}
    target_moment: dict[str, float] = {name: 0.0 for name in target_names}

    for rec in materialised:
        q_name = rec.query_name
        t_name = rec.target_name
        if not (q_name in wanted_queries and t_name in wanted_targets):
            continue
        weight = float(rec.alignment_block_len or (rec.query_end - rec.query_start))
        if weight <= 0:
            continue

        # Mid-point of the alignment on each concatenated axis.
        target_mid = (
            target_starts.get(t_name, 0)
            + (rec.target_start + rec.target_end) / 2.0
        )
        query_mid = (
            query_starts.get(q_name, 0)
            + (rec.query_start + rec.query_end) / 2.0
        )

        # A query is pulled towards where it lands on the target axis ...
        query_mass[q_name] += weight
        query_moment[q_name] += weight * target_mid
        # ... and a target towards where it lands on the query axis.
        target_mass[t_name] += weight
        target_moment[t_name] += weight * query_mid

    unmatched_gravity = float('inf')

    def _key_for(mass: dict, moment: dict, axis_len: float, lengths: dict):
        # Build the sort-key function for one axis.
        def _key(name: str) -> tuple:
            total_weight = mass.get(name, 0.0)
            if total_weight > 0:
                gravity = moment.get(name, 0.0) / total_weight / axis_len
            else:
                gravity = unmatched_gravity
            if gravity != unmatched_gravity:
                return (0, gravity)
            # Unmatched contigs sort after matched ones (1 > 0), longest first.
            return (1, -lengths.get(name, 0))

        return _key

    ordered_queries = list(query_names)
    ordered_queries.sort(
        key=_key_for(query_mass, query_moment, target_axis_len, query_lengths)
    )
    ordered_targets = list(target_names)
    ordered_targets.sort(
        key=_key_for(target_mass, target_moment, query_axis_len, target_lengths)
    )
    return ordered_queries, ordered_targets