Skip to content

Metadata Handling

Sequence metadata and trait management.

Overview

The metadata module handles associated information for sequences:

  • Population assignments
  • Geographic locations
  • Temporal data
  • Custom traits

Modules

pypopart.io.metadata

Metadata file reader and writer for PyPopART.

Handles CSV-based metadata/traits files that can be linked to sequences. Includes support for geographic coordinates (latitude/longitude).

MetadataReader

Reader for CSV-based metadata files.

Source code in src/pypopart/io/metadata.py
class MetadataReader:
    """Reader for CSV-based metadata files."""

    def __init__(
        self,
        filepath: Union[str, Path],
        id_column: str = 'id',
        delimiter: str = ',',
        validate: bool = True,
    ):
        """
        Initialize metadata reader.

        Parameters
        ----------
        filepath :
            Path to metadata CSV file.
        id_column :
            Name of column containing sequence IDs.
        delimiter :
            CSV delimiter character.
        validate :
            Whether to validate metadata.
        """
        self.filepath = Path(filepath)
        self.id_column = id_column
        self.delimiter = delimiter
        self.validate = validate

        if not self.filepath.exists():
            raise FileNotFoundError(f'File not found: {filepath}')

    def _open_file(self) -> TextIO:
        """Open file handling gzip compression."""
        if self.filepath.suffix == '.gz':
            return gzip.open(self.filepath, 'rt')
        else:
            return open(self.filepath, 'r', encoding='utf-8')

    def read_metadata(self) -> Dict[str, Dict[str, str]]:
        """
        Read metadata from CSV file.

        Returns
        -------
            Dictionary mapping sequence IDs to metadata dictionaries.
        """
        metadata = {}

        with self._open_file() as handle:
            reader = csv.DictReader(handle, delimiter=self.delimiter)

            if self.id_column not in reader.fieldnames:
                raise ValueError(
                    f"ID column '{self.id_column}' not found in CSV. "
                    f'Available columns: {reader.fieldnames}'
                )

            for row in reader:
                seq_id = row[self.id_column]

                # Remove ID column from metadata
                row_metadata = {k: v for k, v in row.items() if k != self.id_column}

                if seq_id in metadata and self.validate:
                    raise ValueError(f'Duplicate sequence ID in metadata: {seq_id}')

                metadata[seq_id] = row_metadata

        return metadata

    def apply_to_alignment(self, alignment: Alignment) -> None:
        """
        Apply metadata to sequences in alignment.

        Parameters
        ----------
        alignment :
            Alignment object to update.
        """
        metadata = self.read_metadata()

        missing_ids = []
        for seq in alignment:
            if seq.id in metadata:
                # Update sequence metadata
                seq.metadata.update(metadata[seq.id])
            elif self.validate:
                missing_ids.append(seq.id)

        if missing_ids and self.validate:
            raise ValueError(
                f'Metadata missing for {len(missing_ids)} sequences: '
                f'{", ".join(missing_ids[:5])}{"..." if len(missing_ids) > 5 else ""}'
            )
__init__
__init__(
    filepath: Union[str, Path],
    id_column: str = "id",
    delimiter: str = ",",
    validate: bool = True,
)

Initialize metadata reader.

Parameters:

Name Type Description Default
filepath Union[str, Path]

Path to metadata CSV file.

required
id_column str

Name of column containing sequence IDs.

'id'
delimiter str

CSV delimiter character.

','
validate bool

Whether to validate metadata.

True
Source code in src/pypopart/io/metadata.py
def __init__(
    self,
    filepath: Union[str, Path],
    id_column: str = 'id',
    delimiter: str = ',',
    validate: bool = True,
):
    """
    Initialize metadata reader.

    Parameters
    ----------
    filepath :
        Path to metadata CSV file.
    id_column :
        Name of column containing sequence IDs.
    delimiter :
        CSV delimiter character.
    validate :
        Whether to validate metadata.
    """
    self.filepath = Path(filepath)
    self.id_column = id_column
    self.delimiter = delimiter
    self.validate = validate

    if not self.filepath.exists():
        raise FileNotFoundError(f'File not found: {filepath}')
read_metadata
read_metadata() -> Dict[str, Dict[str, str]]

Read metadata from CSV file.

Returns:

Type Description
Dictionary mapping sequence IDs to metadata dictionaries.
Source code in src/pypopart/io/metadata.py
def read_metadata(self) -> Dict[str, Dict[str, str]]:
    """
    Read metadata from CSV file.

    Returns
    -------
        Dictionary mapping sequence IDs to metadata dictionaries.
    """
    metadata = {}

    with self._open_file() as handle:
        reader = csv.DictReader(handle, delimiter=self.delimiter)

        if self.id_column not in reader.fieldnames:
            raise ValueError(
                f"ID column '{self.id_column}' not found in CSV. "
                f'Available columns: {reader.fieldnames}'
            )

        for row in reader:
            seq_id = row[self.id_column]

            # Remove ID column from metadata
            row_metadata = {k: v for k, v in row.items() if k != self.id_column}

            if seq_id in metadata and self.validate:
                raise ValueError(f'Duplicate sequence ID in metadata: {seq_id}')

            metadata[seq_id] = row_metadata

    return metadata
apply_to_alignment
apply_to_alignment(alignment: Alignment) -> None

Apply metadata to sequences in alignment.

Parameters:

Name Type Description Default
alignment Alignment

Alignment object to update.

required
Source code in src/pypopart/io/metadata.py
def apply_to_alignment(self, alignment: Alignment) -> None:
    """
    Apply metadata to sequences in alignment.

    Parameters
    ----------
    alignment :
        Alignment object to update.
    """
    metadata = self.read_metadata()

    missing_ids = []
    for seq in alignment:
        if seq.id in metadata:
            # Update sequence metadata
            seq.metadata.update(metadata[seq.id])
        elif self.validate:
            missing_ids.append(seq.id)

    if missing_ids and self.validate:
        raise ValueError(
            f'Metadata missing for {len(missing_ids)} sequences: '
            f'{", ".join(missing_ids[:5])}{"..." if len(missing_ids) > 5 else ""}'
        )

MetadataWriter

Writer for CSV-based metadata files.

Source code in src/pypopart/io/metadata.py
class MetadataWriter:
    """Writer for CSV-based metadata files."""

    def __init__(
        self,
        filepath: Union[str, Path],
        id_column: str = 'id',
        delimiter: str = ',',
        compress: Optional[str] = None,
    ):
        """
        Initialize metadata writer.

        Parameters
        ----------
        filepath :
            Output file path.
        id_column :
            Name of column for sequence IDs.
        delimiter :
            CSV delimiter character.
        compress :
            Compression format ('gzip' or None).
        """
        self.filepath = Path(filepath)
        self.id_column = id_column
        self.delimiter = delimiter
        self.compress = compress

        if compress == 'gzip' and not str(self.filepath).endswith('.gz'):
            self.filepath = Path(str(self.filepath) + '.gz')

    def _open_file(self) -> TextIO:
        """Open file for writing with optional compression."""
        if self.compress == 'gzip':
            return gzip.open(self.filepath, 'wt', encoding='utf-8')
        else:
            return open(self.filepath, 'w', encoding='utf-8', newline='')

    def write_metadata(
        self,
        metadata: Dict[str, Dict[str, str]],
        trait_order: Optional[List[str]] = None,
    ) -> None:
        """
        Write metadata to CSV file.

        Parameters
        ----------
        metadata :
            Dictionary mapping sequence IDs to metadata dictionaries.
        trait_order :
            Optional list specifying order of trait columns.
        """
        if not metadata:
            raise ValueError('No metadata to write')

        # Determine all trait keys
        all_traits = set()
        for traits in metadata.values():
            all_traits.update(traits.keys())

        # Order traits
        if trait_order:
            # Use specified order, then add any remaining traits
            fieldnames = [self.id_column] + trait_order
            remaining = sorted(all_traits - set(trait_order))
            fieldnames.extend(remaining)
        else:
            fieldnames = [self.id_column] + sorted(all_traits)

        with self._open_file() as handle:
            writer = csv.DictWriter(
                handle,
                fieldnames=fieldnames,
                delimiter=self.delimiter,
                extrasaction='ignore',
            )

            writer.writeheader()

            for seq_id, traits in sorted(metadata.items()):
                row = {self.id_column: seq_id}
                row.update(traits)
                writer.writerow(row)

    def write_from_alignment(
        self, alignment: Alignment, trait_order: Optional[List[str]] = None
    ) -> None:
        """
        Extract and write metadata from alignment.

        Parameters
        ----------
        alignment :
            Alignment object.
        trait_order :
            Optional list specifying order of trait columns.
        """
        metadata = {}

        for seq in alignment:
            if seq.metadata:
                metadata[seq.id] = seq.metadata

        if not metadata:
            raise ValueError('No metadata found in alignment')

        self.write_metadata(metadata, trait_order)
__init__
__init__(
    filepath: Union[str, Path],
    id_column: str = "id",
    delimiter: str = ",",
    compress: Optional[str] = None,
)

Initialize metadata writer.

Parameters:

Name Type Description Default
filepath Union[str, Path]

Output file path.

required
id_column str

Name of column for sequence IDs.

'id'
delimiter str

CSV delimiter character.

','
compress Optional[str]

Compression format ('gzip' or None).

None
Source code in src/pypopart/io/metadata.py
def __init__(
    self,
    filepath: Union[str, Path],
    id_column: str = 'id',
    delimiter: str = ',',
    compress: Optional[str] = None,
):
    """
    Initialize metadata writer.

    Parameters
    ----------
    filepath :
        Output file path.
    id_column :
        Name of column for sequence IDs.
    delimiter :
        CSV delimiter character.
    compress :
        Compression format ('gzip' or None).
    """
    self.filepath = Path(filepath)
    self.id_column = id_column
    self.delimiter = delimiter
    self.compress = compress

    if compress == 'gzip' and not str(self.filepath).endswith('.gz'):
        self.filepath = Path(str(self.filepath) + '.gz')
write_metadata
write_metadata(
    metadata: Dict[str, Dict[str, str]],
    trait_order: Optional[List[str]] = None,
) -> None

Write metadata to CSV file.

Parameters:

Name Type Description Default
metadata Dict[str, Dict[str, str]]

Dictionary mapping sequence IDs to metadata dictionaries.

required
trait_order Optional[List[str]]

Optional list specifying order of trait columns.

None
Source code in src/pypopart/io/metadata.py
def write_metadata(
    self,
    metadata: Dict[str, Dict[str, str]],
    trait_order: Optional[List[str]] = None,
) -> None:
    """
    Write metadata to CSV file.

    Parameters
    ----------
    metadata :
        Dictionary mapping sequence IDs to metadata dictionaries.
    trait_order :
        Optional list specifying order of trait columns.
    """
    if not metadata:
        raise ValueError('No metadata to write')

    # Determine all trait keys
    all_traits = set()
    for traits in metadata.values():
        all_traits.update(traits.keys())

    # Order traits
    if trait_order:
        # Use specified order, then add any remaining traits
        fieldnames = [self.id_column] + trait_order
        remaining = sorted(all_traits - set(trait_order))
        fieldnames.extend(remaining)
    else:
        fieldnames = [self.id_column] + sorted(all_traits)

    with self._open_file() as handle:
        writer = csv.DictWriter(
            handle,
            fieldnames=fieldnames,
            delimiter=self.delimiter,
            extrasaction='ignore',
        )

        writer.writeheader()

        for seq_id, traits in sorted(metadata.items()):
            row = {self.id_column: seq_id}
            row.update(traits)
            writer.writerow(row)
write_from_alignment
write_from_alignment(
    alignment: Alignment,
    trait_order: Optional[List[str]] = None,
) -> None

Extract and write metadata from alignment.

Parameters:

Name Type Description Default
alignment Alignment

Alignment object.

required
trait_order Optional[List[str]]

Optional list specifying order of trait columns.

None
Source code in src/pypopart/io/metadata.py
def write_from_alignment(
    self, alignment: Alignment, trait_order: Optional[List[str]] = None
) -> None:
    """
    Extract and write metadata from alignment.

    Parameters
    ----------
    alignment :
        Alignment object.
    trait_order :
        Optional list specifying order of trait columns.
    """
    metadata = {}

    for seq in alignment:
        if seq.metadata:
            metadata[seq.id] = seq.metadata

    if not metadata:
        raise ValueError('No metadata found in alignment')

    self.write_metadata(metadata, trait_order)

parse_coordinate

parse_coordinate(value: str) -> float

Parse a coordinate string to a float.

Handles various formats: - Decimal degrees: "45.5", "-123.4" - With degree symbol: "45.5°", "-123.4°"

Args: value: Coordinate string

Returns:

Type Description
Coordinate as float.

Raises:

Type Description
ValueError: If coordinate cannot be parsed
Source code in src/pypopart/io/metadata.py
def parse_coordinate(value: str) -> float:
    """
    Parse a coordinate string to a float.

    Handles various formats:
    - Decimal degrees: "45.5", "-123.4"
    - With degree symbol: "45.5°", "-123.4°"

    Args:
        value: Coordinate string

    Returns
    -------
        Coordinate as float.

    Raises
    ------
        ValueError: If coordinate cannot be parsed
    """
    try:
        # Remove degree symbol and whitespace
        cleaned = value.strip().replace('°', '').replace(' ', '')
        return float(cleaned)
    except (ValueError, AttributeError) as e:
        raise ValueError(f'Invalid coordinate value: {value}') from e

validate_latitude

validate_latitude(lat: float) -> None

Validate latitude value.

Args: lat: Latitude value

Raises:

Type Description
ValueError: If latitude is out of range [-90, 90]
Source code in src/pypopart/io/metadata.py
def validate_latitude(lat: float) -> None:
    """
    Validate latitude value.

    Args:
        lat: Latitude value

    Raises
    ------
        ValueError: If latitude is out of range [-90, 90]
    """
    if not -90 <= lat <= 90:
        raise ValueError(f'Latitude must be between -90 and 90, got {lat}')

validate_longitude

validate_longitude(lon: float) -> None

Validate longitude value.

Args: lon: Longitude value

Raises:

Type Description
ValueError: If longitude is out of range [-180, 180]
Source code in src/pypopart/io/metadata.py
def validate_longitude(lon: float) -> None:
    """
    Validate longitude value.

    Args:
        lon: Longitude value

    Raises
    ------
        ValueError: If longitude is out of range [-180, 180]
    """
    if not -180 <= lon <= 180:
        raise ValueError(f'Longitude must be between -180 and 180, got {lon}')

extract_coordinates

extract_coordinates(
    metadata: Dict[str, str],
    lat_column: str = "latitude",
    lon_column: str = "longitude",
    validate: bool = True,
) -> Optional[Tuple[float, float]]

Extract and validate geographic coordinates from metadata.

Args: metadata: Metadata dictionary lat_column: Name of latitude column lon_column: Name of longitude column validate: Whether to validate coordinate ranges

Returns:

Type Description
Tuple of (latitude, longitude) or None if coordinates not present.

Raises:

Type Description
ValueError: If coordinates are invalid
Source code in src/pypopart/io/metadata.py
def extract_coordinates(
    metadata: Dict[str, str],
    lat_column: str = 'latitude',
    lon_column: str = 'longitude',
    validate: bool = True,
) -> Optional[Tuple[float, float]]:
    """
    Extract and validate geographic coordinates from metadata.

    Args:
        metadata: Metadata dictionary
        lat_column: Name of latitude column
        lon_column: Name of longitude column
        validate: Whether to validate coordinate ranges

    Returns
    -------
        Tuple of (latitude, longitude) or None if coordinates not present.

    Raises
    ------
        ValueError: If coordinates are invalid
    """
    if lat_column not in metadata or lon_column not in metadata:
        return None

    lat = parse_coordinate(metadata[lat_column])
    lon = parse_coordinate(metadata[lon_column])

    if validate:
        validate_latitude(lat)
        validate_longitude(lon)

    return (lat, lon)