Rust Library Cookbook
This guide provides practical examples for using grit_genomics as a Rust library in your own projects.
Getting Started
Add to your Cargo.toml:
[dependencies]
grit-genomics = "0.1"
Basic Usage
Reading BED Files
use grit_genomics::prelude::*;
use grit_genomics::bed::{read_intervals, parse_intervals, BedReader};
fn main() -> Result<(), Box<dyn std::error::Error>> {
// Read all intervals from a file
let intervals = read_intervals("regions.bed")?;
println!("Loaded {} intervals", intervals.len());
// Parse from a string
let content = "chr1\t100\t200\nchr1\t300\t400\n";
let intervals = parse_intervals(content)?;
// Streaming reader for large files
let mut reader = BedReader::from_path("large.bed")?;
while let Some(record) = reader.read_record()? {
println!("{}\t{}\t{}", record.chrom, record.start, record.end);
}
Ok(())
}
Working with Intervals
use grit_genomics::interval::{Interval, Strand};
fn main() {
// Create an interval
let iv = Interval::new("chr1", 100, 200);
// With strand information
let iv_stranded = Interval::with_strand("chr1", 100, 200, Strand::Forward);
// Check overlap
let other = Interval::new("chr1", 150, 250);
if iv.overlaps(&other) {
println!("Intervals overlap by {} bp", iv.overlap_length(&other));
}
// Calculate distance (returns None if overlapping)
let distant = Interval::new("chr1", 300, 400);
if let Some(dist) = iv.distance_to(&distant) {
println!("Distance: {} bp", dist);
}
}
Command Examples
Intersect: Find Overlapping Intervals
use grit_genomics::prelude::*;
use grit_genomics::bed::parse_intervals;
fn main() -> Result<(), Box<dyn std::error::Error>> {
let a = parse_intervals("chr1\t100\t200\nchr1\t300\t400\n")?;
let b = parse_intervals("chr1\t150\t250\nchr1\t350\t450\n")?;
// Build index for efficient lookups
let b_index = IntervalIndex::from_intervals(b);
// Find intersections
let cmd = IntersectCommand::new();
let results = cmd.find_intersections(&a, &b_index);
for interval in results {
println!("{}\t{}\t{}", interval.chrom, interval.start, interval.end);
}
Ok(())
}
Intersect with Options
use grit_genomics::commands::IntersectCommand;
let mut cmd = IntersectCommand::new();
cmd.write_a = true; // Include original A interval
cmd.write_b = true; // Include original B interval
cmd.fraction = Some(0.5); // Require 50% overlap
cmd.reciprocal = true; // Require reciprocal overlap
cmd.unique = true; // Report each A only once
Merge: Combine Overlapping Intervals
use grit_genomics::prelude::*;
use grit_genomics::bed::parse_intervals;
fn main() -> Result<(), Box<dyn std::error::Error>> {
let intervals = parse_intervals(
"chr1\t100\t200\nchr1\t150\t250\nchr1\t300\t400\n"
)?;
let cmd = MergeCommand::new();
let merged = cmd.merge(intervals);
// Result: chr1:100-250 and chr1:300-400
assert_eq!(merged.len(), 2);
Ok(())
}
Merge with Distance
use grit_genomics::commands::MergeCommand;
let mut cmd = MergeCommand::new();
cmd.distance = 100; // Merge intervals within 100bp of each other
Subtract: Remove Overlapping Regions
use grit_genomics::prelude::*;
use grit_genomics::bed::parse_intervals;
fn main() -> Result<(), Box<dyn std::error::Error>> {
let a = parse_intervals("chr1\t100\t300\n")?;
let b = parse_intervals("chr1\t150\t200\n")?;
let b_index = IntervalIndex::from_intervals(b);
let cmd = SubtractCommand::new();
let results = cmd.subtract(&a, &b_index);
// Result: chr1:100-150 and chr1:200-300
assert_eq!(results.len(), 2);
Ok(())
}
Closest: Find Nearest Intervals
use grit_genomics::prelude::*;
use grit_genomics::bed::parse_intervals;
fn main() -> Result<(), Box<dyn std::error::Error>> {
let a = parse_intervals("chr1\t100\t200\n")?;
let b = parse_intervals("chr1\t300\t400\nchr1\t500\t600\n")?;
let b_index = IntervalIndex::from_intervals(b);
let cmd = ClosestCommand::new();
let results = cmd.find_closest(&a, &b_index);
// Each result is (a_interval, closest_b_interval, distance)
for (a_iv, b_iv, dist) in results {
println!("{} closest to {} (distance: {})",
a_iv.start, b_iv.start, dist);
}
Ok(())
}
Sort: Order Intervals
use grit_genomics::prelude::*;
use grit_genomics::bed::parse_intervals;
fn main() -> Result<(), Box<dyn std::error::Error>> {
let mut intervals = parse_intervals(
"chr2\t100\t200\nchr1\t300\t400\nchr1\t100\t200\n"
)?;
let cmd = SortCommand::new();
cmd.sort(&mut intervals);
// Now sorted: chr1:100-200, chr1:300-400, chr2:100-200
Ok(())
}
Streaming Operations
For large files that don’t fit in memory, use streaming commands:
Streaming Intersect
use grit_genomics::commands::StreamingIntersectCommand;
use std::io::{BufWriter, stdout};
fn main() -> Result<(), Box<dyn std::error::Error>> {
let mut cmd = StreamingIntersectCommand::new();
cmd.assume_sorted = true; // Skip validation for speed
let mut output = BufWriter::new(stdout());
cmd.run("a.bed", "b.bed", &mut output)?;
Ok(())
}
Streaming Merge
use grit_genomics::commands::StreamingMergeCommand;
use std::io::{BufWriter, stdout};
fn main() -> Result<(), Box<dyn std::error::Error>> {
let mut cmd = StreamingMergeCommand::new();
cmd.distance = 100;
let mut output = BufWriter::new(stdout());
let stats = cmd.run("input.bed", &mut output)?;
eprintln!("Merged {} intervals into {}",
stats.input_count, stats.output_count);
Ok(())
}
Building an Index
For repeated queries against the same dataset:
use grit_genomics::prelude::*;
use grit_genomics::bed::read_intervals;
fn main() -> Result<(), Box<dyn std::error::Error>> {
// Load and index once
let features = read_intervals("features.bed")?;
let index = IntervalIndex::from_intervals(features);
// Query multiple times
let query = Interval::new("chr1", 1000, 2000);
let overlapping: Vec<_> = index.find_overlapping(&query).collect();
println!("Found {} overlapping features", overlapping.len());
Ok(())
}
Working with BED Records
BedRecord preserves all columns from the input:
use grit_genomics::bed::BedReader;
fn main() -> Result<(), Box<dyn std::error::Error>> {
let mut reader = BedReader::from_path("genes.bed")?;
while let Some(record) = reader.read_record()? {
// Core fields
println!("{}:{}-{}", record.chrom, record.start, record.end);
// Optional fields (if present)
if let Some(name) = &record.name {
println!(" Name: {}", name);
}
if let Some(score) = record.score {
println!(" Score: {}", score);
}
if let Some(strand) = &record.strand {
println!(" Strand: {:?}", strand);
}
// Additional fields preserved as raw string
if let Some(rest) = &record.rest {
println!(" Extra: {}", rest);
}
}
Ok(())
}
Error Handling
use grit_genomics::bed::{BedError, BedReader};
fn main() {
match BedReader::from_path("nonexistent.bed") {
Ok(reader) => { /* use reader */ }
Err(BedError::Io(e)) => {
eprintln!("File error: {}", e);
}
Err(BedError::Parse { line, message }) => {
eprintln!("Parse error at line {}: {}", line, message);
}
Err(BedError::InvalidFormat(msg)) => {
eprintln!("Invalid format: {}", msg);
}
}
}
Parallel Processing
For maximum speed on multi-core systems:
use grit_genomics::prelude::*;
use grit_genomics::bed::read_intervals;
fn main() -> Result<(), Box<dyn std::error::Error>> {
let a = read_intervals("a.bed")?;
let b = read_intervals("b.bed")?;
let cmd = IntersectCommand::new();
// Parallel processing using Rayon
let results = cmd.find_intersections_parallel(a, b);
println!("Found {} intersections", results.len());
Ok(())
}
Complete Example: ChIP-seq Peak Analysis
use grit_genomics::prelude::*;
use grit_genomics::bed::read_intervals;
use grit_genomics::commands::SlopCommand;
fn main() -> Result<(), Box<dyn std::error::Error>> {
// Load peaks and genes
let peaks = read_intervals("peaks.bed")?;
let genes = read_intervals("genes.bed")?;
// Extend gene promoters (2kb upstream)
let slop_cmd = SlopCommand::new();
let promoters = slop_cmd.slop_intervals(&genes, 2000, 0, "genome.txt")?;
// Build index for promoters
let promoter_index = IntervalIndex::from_intervals(promoters);
// Find peaks in promoters
let intersect_cmd = IntersectCommand::new();
let peaks_in_promoters = intersect_cmd.find_intersections(&peaks, &promoter_index);
println!("Found {} peaks in promoter regions", peaks_in_promoters.len());
// Merge nearby peaks
let mut merge_cmd = MergeCommand::new();
merge_cmd.distance = 500; // Within 500bp
let merged_peaks = merge_cmd.merge(peaks_in_promoters);
println!("Merged into {} peak regions", merged_peaks.len());
Ok(())
}
Tips
-
Use streaming for large files: If your files exceed available RAM, use
StreamingIntersectCommandetc. -
Pre-sort for streaming: Streaming commands require sorted input. Sort once with
SortCommandorgrit sort. -
Build indices for repeated queries: If querying the same dataset multiple times, build an
IntervalIndexonce. -
Release the GIL in Python: The library automatically releases Python’s GIL during computation, enabling true parallelism.
-
Check stats: Many streaming commands return statistics about processing.