Johannes Köster
July 2016
combines computer science, statistics, mathematics to interpret biological data
Involves:
Mathematical or statistical modelling, machine learning, ...
Almost always:
deal with biological sequences
encodes the genome, the "blueprint" of any organism
Text over the alphabet ACGT
building blocks of cells
Text over the alphabet ARNDCEQGHILKMFPSTWYV
Examples:
Techniques:
Big data:
Wanted:
Reality:
Idea:
common Bioinformatics algorithms and data structures as a library
Features:
ACGGATGCGATCGGAGCTGAGCTGCGATCGATGCGATCG ||| ||| |||||||| CGA---GAGTTGAGCTGC
Given:
sequence A and B
Find:
combination of substitutions, deletions and insertions with maximal score / minimal cost that transforms B into A
Variants:
Implementation:
// two texts
let x = b"ACCGTGGAT";
let y = b"AAAAACCGTTGAT";
// scoring
let score = |a: u8, b: u8| if a == b {1i32} else {-1i32};
let gap_open = -5;
let gap_extend = -1;
// init aligner
let mut aligner = Aligner::with_capacity(x.len(), y.len(), gap_open, gap_extend, &score);
// perform alignment
let alignment = aligner.semiglobal(x, y);
// results
alignment.pretty()
// output:
// AAAAACCGTTGAT
// ||||| |||
// ----ACCGTGGAT
alignment.operations
// output:
// [Match, Match, Match, Match, Match, Subst, Match, Match, Match]
alignment.cigar()
// output:
// 5=1X3=
pub struct Aligner<'a, F>
where F: 'a + Fn(u8, u8) -> i32
{
// dynamic programming matrix columns
S: [Vec<i32>; 2],
I: [Vec<i32>; 2],
D: [Vec<i32>; 2],
// traceback matrix (as bitvectors)
S_traceback: Traceback,
I_traceback: Traceback,
D_traceback: Traceback,
// scoring
gap_open: i32,
gap_extend: i32,
// function for custom substitution costs
score: &'a F,
}
Avoid reallocation:
aligner can be reused multiple times
macro_rules! align {
(
$aligner:ident, $x:ident, $y:ident, $state:ident,
$init:block, $inner:block, $outer:block, $ret:block
) => (
{
let mut $state = AlignmentState { ... };
// iterate over columns
while $state.i <= $state.n {
// init block
$init
// read next y symbol
let b = $y[$state.i - 1];
$state.j = 1;
// iterate over rows
while $state.j <= $state.m {
// read next x symbol
let a = $x[$state.j - 1];
// calculate scores for substitution, insertion and deletion
...
// inner block
$inner
// update dynamic programming matrix with best score
$aligner.S[$state.col][$state.j] = $state.score;
$state.j += 1;
}
// outer block
$outer
$state.i += 1;
}
// return code
$ret
}
);
}
align!(
self,
x,
y,
state,
{
// init block (before each column)
self.S[state.col][0] = i32::MIN - self.gap_open;
self.I[state.col][0] = i32::MIN - self.gap_open;
self.D[state.col][0] = self.gap_open + (state.i as i32 - 1) * self.gap_extend;
self.S_traceback.del(state.i, 0);
self.D_traceback.del(state.i, 0);
self.I_traceback.del(state.i, 0);
},
{
// inner block (after each row)
},
{
// outer block (after each column)
},
{
self.alignment(state.n, state.m, x, y, state.score)
}
)
align!(
self,
x,
y,
state,
{
// init block (before each column)
self.S[state.col][0] = 0;
},
{
// inner block (after each row)
},
{
// outer block (after each column)
if state.score > state.best || state.best_j != state.m {
state.best = state.score;
state.best_i = state.i;
state.best_j = state.m;
}
},
{
self.alignment(state.best_i, state.best_j, x, y, state.best)
}
)
align!(
self,
x,
y,
state,
{
// init block (before each column)
self.S[state.col][0] = 0;
},
{
// inner block (after each row)
if state.score < 0 {
self.S_traceback.start(state.i, state.j);
state.score = 0;
} else if state.score > state.best {
state.best = state.score;
state.best_i = state.i;
state.best_j = state.j;
}
},
{
// outer block (after each column)
},
{
self.alignment(state.best_i, state.best_j, x, y, state.best)
}
)
use bio::data_structures::bitenc::BitEnc;
// two bit encoding
let mut bitenc = BitEnc::new(2);
// push values
bitenc.push(0);
bitenc.push(2);
bitenc.push(1);
// random access
bitenc.get(1)
// output:
// Some(2u8)
// iterate
bitenc.iter().collect();
// output:
// [0u8, 2u8, 1u8]
use bio::data_structures::smallints::SmallInts;
// init
let mut smallints: SmallInts<u8, u64> = SmallInts::new();
// add values
smallints.push(3);
smallints.push(4);
smallints.push(255);
smallints.push(305093);
// random access
smallints.get(0)
// output:
// Some(3u64)
// iteration
smallints.iter().collect()
// output:
// [3u64, 4u64, 255u64, 305093u64]
pub struct SmallInts<S: Integer + Bounded + NumCast + Copy, B: Integer + NumCast + Copy> {
smallints: Vec<S>,
bigints: BTreeMap<usize, B>,
}
impl<...> SmallInts<S, B> {
fn real_value(&self, i: usize, v: S) -> Option<B> {
if v < S::max_value() {
cast(v)
} else {
self.bigints.get(&i).cloned()
}
}
}
extern crate bit_vec;
use bio::data_structures::rank_select::RankSelect;
use bit_vec::BitVec;
// init
let mut bits = BitVec::from_elem(64, false);
bits.set(5, true);
bits.set(32, true);
let rs = RankSelect::new(bits, 1);
// query rank of bit
rs.rank(5)
// output:
// Some(1)
// select first bit with given rank
rs.select(2)
// output:
// Some(32)
0000100000000000000000000000000100 0000111111111111111111111111111222
bits:
ranks:
Types:
Parallelization:
Contributors:
Christopher Schröder
Peer Aramillo Irizar
Fedor Gusev
Vadim Nazarov
Brad Chapman
Florian Gilcher
Erik Clarke
Rizky Luthfianto
Adam Perry
Taylor Cramer
Andre Bogus
Martin Larralde
Philipp Angerer
Pierre Marijon
Franklin Delehelle
Crates:
itertools
csv
nalgebra
approx
serde