Quick Start Tutorial¶

This notebook demonstrates the core workflow of rusty-dot: building a sequence index, finding shared k-mer matches, generating PAF-format alignments, and visualising results as a dotplot.

Prerequisites¶

Install rusty-dot following the Installation guide, then run:

pip install jupyter

1. Import the package¶

In [ ]:

Copied!

import matplotlib.pyplot as plt

from rusty_dot import SequenceIndex
from rusty_dot.dotplot import DotPlotter

print('rusty-dot imported successfully')
import matplotlib.pyplot as plt

from rusty_dot import SequenceIndex
from rusty_dot.dotplot import DotPlotter

print('rusty-dot imported successfully')

2. Build a SequenceIndex¶

SequenceIndex is the core class. You supply the k-mer length k at construction time. A larger k reduces spurious matches but may miss shorter conserved regions.

In [ ]:

Copied!

# Create an index with k=10
idx = SequenceIndex(k=10)
print(idx)  # SequenceIndex(k=10, sequences=0)
# Create an index with k=10
idx = SequenceIndex(k=10)
print(idx)  # SequenceIndex(k=10, sequences=0)

3. Add sequences directly¶

Use add_sequence(name, seq) to add individual sequences. These are stored in-memory and indexed immediately.

In [ ]:

Copied!





# Two sequences with a shared region
seq_a = 'ACGTACGTACGTACGTACGT' * 5  # 100 bp
seq_b = 'TACGTACGTACGTACGTACG' * 5  # 100 bp (shifted by 1 bp)
seq_c = 'GCGCGCGCGCGCGCGCGCGC' * 5  # 100 bp (different content)

idx.add_sequence('seq_a', seq_a)
idx.add_sequence('seq_b', seq_b)
idx.add_sequence('seq_c', seq_c)

print(f'Indexed sequences: {idx.sequence_names()}')
print(f'Total sequences: {len(idx)}')
# Two sequences with a shared region
seq_a = 'ACGTACGTACGTACGTACGT' * 5  # 100 bp
seq_b = 'TACGTACGTACGTACGTACG' * 5  # 100 bp (shifted by 1 bp)
seq_c = 'GCGCGCGCGCGCGCGCGCGC' * 5  # 100 bp (different content)

idx.add_sequence('seq_a', seq_a)
idx.add_sequence('seq_b', seq_b)
idx.add_sequence('seq_c', seq_c)

print(f'Indexed sequences: {idx.sequence_names()}')
print(f'Total sequences: {len(idx)}')

4. Query the index¶

4a. Inspect k-mer sets¶

In [ ]:

Copied!





# How many unique 10-mers does each sequence have?
for name in idx.sequence_names():
    kset = idx.get_kmer_set(name)
    print(
        f'{name}: {len(kset)} unique k-mers, length={idx.get_sequence_length(name)} bp'
    )
# How many unique 10-mers does each sequence have?
for name in idx.sequence_names():
    kset = idx.get_kmer_set(name)
    print(
        f'{name}: {len(kset)} unique k-mers, length={idx.get_sequence_length(name)} bp'
    )

4b. Find shared k-mer matches¶

compare_sequences intersects the k-mer sets and returns (query_start, query_end, target_start, target_end) tuples.

In [ ]:

Copied!





matches = idx.compare_sequences('seq_a', 'seq_b', merge=True)
print(f'seq_a vs seq_b: {len(matches)} match blocks')
for m in matches[:5]:
    q_start, q_end, t_start, t_end = m
    print(
        f'  query [{q_start}-{q_end}]  target [{t_start}-{t_end}]  length={q_end - q_start}'
    )
matches = idx.compare_sequences('seq_a', 'seq_b', merge=True)
print(f'seq_a vs seq_b: {len(matches)} match blocks')
for m in matches[:5]:
    q_start, q_end, t_start, t_end = m
    print(
        f'  query [{q_start}-{q_end}]  target [{t_start}-{t_end}]  length={q_end - q_start}'
    )

In [ ]:

Copied!

# Fewer matches between unrelated sequences
matches_ac = idx.compare_sequences('seq_a', 'seq_c', merge=True)
print(f'seq_a vs seq_c: {len(matches_ac)} match blocks')
# Fewer matches between unrelated sequences
matches_ac = idx.compare_sequences('seq_a', 'seq_c', merge=True)
print(f'seq_a vs seq_c: {len(matches_ac)} match blocks')

4c. Stranded matches¶

compare_sequences_stranded additionally searches for reverse-complement matches, returning a 5-tuple (q_start, q_end, t_start, t_end, strand) where strand is "+" or "-".

In [ ]:

Copied!





stranded = idx.compare_sequences_stranded('seq_a', 'seq_b', merge=True)
fwd = [m for m in stranded if m[4] == '+']
rev = [m for m in stranded if m[4] == '-']
print(f'Forward matches: {len(fwd)}, Reverse-complement matches: {len(rev)}')
stranded = idx.compare_sequences_stranded('seq_a', 'seq_b', merge=True)
fwd = [m for m in stranded if m[4] == '+']
rev = [m for m in stranded if m[4] == '-']
print(f'Forward matches: {len(fwd)}, Reverse-complement matches: {len(rev)}')

5. PAF output¶

PAF (Pairwise mApping Format) is a standard tab-separated format for alignment records. get_paf returns a list of 12-column PAF strings.

In [ ]:

Copied!





paf_lines = idx.get_paf('seq_a', 'seq_b', merge=True)
print(f'PAF lines: {len(paf_lines)}')
print('\nFirst PAF line:')
if paf_lines:
    fields = paf_lines[0].split('\t')
    labels = [
        'query',
        'q_len',
        'q_start',
        'q_end',
        'strand',
        'target',
        't_len',
        't_start',
        't_end',
        'matches',
        'block_len',
        'mapq',
    ]
    for label, value in zip(labels, fields):
        print(f'  {label:12s}: {value}')
paf_lines = idx.get_paf('seq_a', 'seq_b', merge=True)
print(f'PAF lines: {len(paf_lines)}')
print('\nFirst PAF line:')
if paf_lines:
    fields = paf_lines[0].split('\t')
    labels = [
        'query',
        'q_len',
        'q_start',
        'q_end',
        'strand',
        'target',
        't_len',
        't_start',
        't_end',
        'matches',
        'block_len',
        'mapq',
    ]
    for label, value in zip(labels, fields):
        print(f'  {label:12s}: {value}')

In [ ]:

Copied!





# Save PAF output to a file
import os
import tempfile

with tempfile.NamedTemporaryFile(mode='w', suffix='.paf', delete=False) as fh:
    paf_path = fh.name
    for line in paf_lines:
        fh.write(line + '\n')

print(f'PAF saved to: {paf_path}')
print(f'File size: {os.path.getsize(paf_path)} bytes')
# Save PAF output to a file
import os
import tempfile

with tempfile.NamedTemporaryFile(mode='w', suffix='.paf', delete=False) as fh:
    paf_path = fh.name
    for line in paf_lines:
        fh.write(line + '\n')

print(f'PAF saved to: {paf_path}')
print(f'File size: {os.path.getsize(paf_path)} bytes')

6. Load from FASTA files¶

load_fasta reads plain or gzipped FASTA files with automatic format detection.

In [ ]:

Copied!





import gzip
import tempfile

fasta_content = '>seq1\nACGTACGTACGTACGTACGT\n>seq2\nTACGTACGTACGTACGTACG\n'

# Plain FASTA
with tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False) as fh:
    fh.write(fasta_content)
    plain_path = fh.name

# Gzipped FASTA
with tempfile.NamedTemporaryFile(suffix='.fasta.gz', delete=False) as fh:
    gz_path = fh.name
with gzip.open(gz_path, 'wt') as fh:
    fh.write(fasta_content)

# Load into a fresh index
idx2 = SequenceIndex(k=10)
names = idx2.load_fasta(plain_path)
print(f'Loaded from plain FASTA: {names}')

idx3 = SequenceIndex(k=10)
names_gz = idx3.load_fasta(gz_path)
print(f'Loaded from gzipped FASTA: {names_gz}')
import gzip
import tempfile

fasta_content = '>seq1\nACGTACGTACGTACGTACGT\n>seq2\nTACGTACGTACGTACGTACG\n'

# Plain FASTA
with tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False) as fh:
    fh.write(fasta_content)
    plain_path = fh.name

# Gzipped FASTA
with tempfile.NamedTemporaryFile(suffix='.fasta.gz', delete=False) as fh:
    gz_path = fh.name
with gzip.open(gz_path, 'wt') as fh:
    fh.write(fasta_content)

# Load into a fresh index
idx2 = SequenceIndex(k=10)
names = idx2.load_fasta(plain_path)
print(f'Loaded from plain FASTA: {names}')

idx3 = SequenceIndex(k=10)
names_gz = idx3.load_fasta(gz_path)
print(f'Loaded from gzipped FASTA: {names_gz}')

7. Save and load the index¶

Indexes can be serialised to disk and reloaded, avoiding the need to reprocess large FASTA files.

In [ ]:

Copied!





import tempfile

# Save
with tempfile.NamedTemporaryFile(suffix='.bin', delete=False) as fh:
    idx_path = fh.name

idx.save(idx_path)
print(f'Index saved to: {idx_path}')
print(f'File size: {os.path.getsize(idx_path)} bytes')

# Load into a new index (k must match)
idx_loaded = SequenceIndex(k=10)
idx_loaded.load(idx_path)
print(f'Loaded index: {idx_loaded}')
print(f'Sequences: {idx_loaded.sequence_names()}')
import tempfile

# Save
with tempfile.NamedTemporaryFile(suffix='.bin', delete=False) as fh:
    idx_path = fh.name

idx.save(idx_path)
print(f'Index saved to: {idx_path}')
print(f'File size: {os.path.getsize(idx_path)} bytes')

# Load into a new index (k must match)
idx_loaded = SequenceIndex(k=10)
idx_loaded.load(idx_path)
print(f'Loaded index: {idx_loaded}')
print(f'Sequences: {idx_loaded.sequence_names()}')

8. Pre-compute all pairs¶

For repeated queries over the same dataset, precompute_all_pairs fills the result cache for every ordered (i, j) pair in one go.

In [ ]:

Copied!





pairs = idx.precompute_all_pairs(merge=True)
print(f'Pre-computed {len(pairs)} pairs:')
for q, t in pairs:
    print(f'  {q} vs {t}')
pairs = idx.precompute_all_pairs(merge=True)
print(f'Pre-computed {len(pairs)} pairs:')
for q, t in pairs:
    print(f'  {q} vs {t}')

9. Simple dotplot¶

The DotPlotter class wraps the index to generate publication-ready dotplots.

Both plot() and plot_single() return a matplotlib.figure.Figure. In a Jupyter notebook the figure is displayed inline automatically when you omit output_path. Pass output_path to save to disk as well.

See the Dotplot Visualization tutorial for the full list of options including SVG, PDF, and other file formats.

In [ ]:

Copied!

plotter = DotPlotter(idx)

# Inline display only — no file is written
fig = plotter.plot(title='Quick Start Dotplot — inline')
plt.close(fig)  # free memory when no longer needed
plotter = DotPlotter(idx)

# Inline display only — no file is written
fig = plotter.plot(title='Quick Start Dotplot — inline')
plt.close(fig)  # free memory when no longer needed

In [ ]:

Copied!





# Save to a PNG file and display inline
with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as fh:
    out_path = fh.name

fig = plotter.plot(output_path=out_path, title='Quick Start Dotplot')
plt.close(fig)
print(f'Dotplot saved to: {out_path}')
# Save to a PNG file and display inline
with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as fh:
    out_path = fh.name

fig = plotter.plot(output_path=out_path, title='Quick Start Dotplot')
plt.close(fig)
print(f'Dotplot saved to: {out_path}')

Next steps¶

Dotplot Visualization tutorial — all plot customisation options.
PAF Workflow tutorial — loading PAF files, CIGAR parsing, and contig reordering.
API Reference — complete documentation for every class and function.