Snakemake

design patterns

rule convert_svg:
    input:
        "{prefix}.svg"
    output:
        "{prefix}.{fmt,(pdf|png)}"
    conda:
        "envs/cairosvg.yml"
    shell:
        "cairosvg -f {wildcards.fmt} {input} -o {output}"

Generic conversions

Configuration

import pandas as pd
from snakemake.untils import validate


configfile: "config/config.yaml"
validate(config, schema="../schemas/config.schema.yaml")

samples = pd.read_csv(config["samples"], sep="\t", dtype=str)\
            .set_index("sample", drop=False)
validate(samples, schema="../schemas/samples.schema.yaml")

units = pd.read_csv(config["units"], dtype=str, sep="\t")\
          .set_index(["sample", "unit"], drop=False)
validate(units, schema="../schemas/units.schema.yaml")
sample	condition	batch_effect
A	treated	batch1
B	untreated	batch1
C	treated	batch2
D	untreated	batch2

samples.tsv:

units.tsv:

sample	unit	fragment_len_mean	fragment_len_sd	fq1	fq2
A	1			raw/a.chr21.1.fq	raw/a.chr21.2.fq
B	1			raw/b.chr21.1.fq	raw/b.chr21.2.fq
B	2	300	14	raw/b.chr21.1.fq	
C	1			raw/a.chr21.1.fq	raw/a.chr21.2.fq
D	1			raw/b.chr21.1.fq	raw/b.chr21.2.fq

Configuration

$schema: "http://json-schema.org/draft-04/schema#"
description: |
  row of the units.tsv, representing a sequencing unit, 
  i.e. single-end or paired-end data
type: object
properties:
  sample:
    type: string
    description: sample name/id the unit has been sequenced from
  unit:
    type: string
    description: unit id
  fq1:
    type: string
    description: path to FASTQ file
  fq2:
    type: string
    description: path to second FASTQ file (leave empty in case of single-end)
required:
  - sample
  - unit
  - fq1

units.schema.yaml:

pepfile: "pep/config.yaml"
pepschema: "schemas/pep.yaml"

  
rule all:
    input:
        expand(
            "results/{sample}.someresult.txt", 
            sample=pep.sample_table["sample_name"]
        )

Configuration with PEP

def get_samples_by_date(wildcards):
    return samples.loc[samples["date"] == wildcards.date, "sample_name"]
  
  
def get_rule_input(wildcards):
    return expand("results/someresult/{sample}.txt", sample=get_samples_by_date(wildcards))
  
  
rule a:
    input:
        get_rule_input
    output:
        "results/{date}/some-output.txt"
    params:
        samples=get_samples_by_date
    script:
        "scripts/aggregate.py"


# scripts/aggregate.py
 
for sample, infile in zip(snakemake.params.samples, snakemake.input):
    # do something

Aggregation with metadata

Use all given cores

(or a function of them)

rule map_reads:
    input:
        reads=get_merged
    output:
        "mapped/{sample}.sorted.bam"
    params:
        index="refs/genome"
    threads:
      	workflow.cores
    wrapper:
        "0.39.0/bio/bwa/mem"

Use all given cores

(or a function of them)

rule map_reads:
    input:
        reads=get_merged
    output:
        "mapped/{sample}.sorted.bam"
    params:
        index="refs/genome"
    threads:
      	workflow.cores / len(samples)
    wrapper:
        "0.39.0/bio/bwa/mem"

Piping

rule cutadapt:
    input:
        "reads/{sample}.fastq"
    output:
        fastq=pipe("trimmed/{sample}.fastq"),
        qc="trimmed/{sample}.qc.txt"
    wrapper:
        "0.45.1/bio/cutadapt/se"


rule bowtie:
    input:
        reads="trimmed/{sample}.fastq"
    output:
        "mapped/{sample}.bam"
    threads: 8
    params:
        index="refs/genome"
    wrapper:
        "0.45.1/bio/bowtie2/align"

Grouping

rule samtools_sort:
    input:
        "mapped/{sample}.bam"
    output:
        "mapped/{sample}.sorted.bam"
    group: 
         "sort-and-index"
    threads: 8
    wrapper:
        "0.45.1/bio/samtools/sort"
        

rule samtools_index:
    input:
        "mapped/{sample}.sorted.bam"
    output:
        "mapped/{sample}.sorted.bam.bai"
    group: 
        "sort-and-index"
    wrapper:
        "0.45.1/bio/samtools/index"

Grouping via CLI

rule all:
    input:
        "data.10.transformed.txt"

def get_iteration_input(wildcards):
    i = int(wildcards.i)
    if i == 0:
        return "data.txt"
    else:
        return f"data.{i-1}.transformed.txt"

rule iterate:
    input:
        get_iteration_input
    output:
        "data.{i}.transformed.txt"

Iteration

Handling references

rule get_genome:
    output:
        "refs/genome.fasta"
    params:
        species=config["ref"]["species"],
        datatype="dna",
        build=config["ref"]["build"],
        release="98"
    wrapper:
        "0.45.1/bio/reference/ensembl-sequence"

        
rule get_known_variants:
    input:
        # use fai to annotate contig lengths for GATK BQSR
        fai="refs/genome.fasta.fai"
    output:
        vcf="refs/variation.vcf.gz"
    params:
        species=config["ref"]["species"],
        release="98",
        type="all"
    wrapper:
        "0.45.1/bio/reference/ensembl-variation"


rule bwa_index:
    input:
        "refs/genome.fasta"
    output:
        multiext("refs/genome", ".amb", ".ann", ".bwt", ".pac", ".sa")
    log:
        "logs/bwa_index.log"
    params:
        prefix="refs/genome"
    wrapper:
        "0.45.1/bio/bwa/index"

Handling references (efficiently)

rule get_genome:
    output:
        "refs/genome.fasta"
    params:
        species=config["ref"]["species"],
        datatype="dna",
        build=config["ref"]["build"],
        release="98"
    cache: True
    wrapper:
        "0.45.1/bio/reference/ensembl-sequence"

        
rule get_known_variants:
    input:
        # use fai to annotate contig lengths for GATK BQSR
        fai="refs/genome.fasta.fai"
    output:
        vcf="refs/variation.vcf.gz"
    params:
        species=config["ref"]["species"],
        release="98",
        type="all"
    cache: True
    wrapper:
        "0.45.1/bio/reference/ensembl-variation"


rule bwa_index:
    input:
        "refs/genome.fasta"
    output:
        multiext("refs/genome", ".amb", ".ann", ".bwt", ".pac", ".sa")
    log:
        "logs/bwa_index.log"
    params:
        prefix="refs/genome"
    cache: True
    wrapper:
        "0.45.1/bio/bwa/index"

Parameter spaces

from snakemake.utils import Paramspace
import pandas as pd


paramspace = Paramspace(pd.read_csv("params.tsv", sep="\t"))


rule all:
    input:
        expand(
            "results/simulations/{params}.pdf",
            params=paramspace.instance_patterns
        )


rule simulate:
    output:
        f"results/simulations/{paramspace.wildcard_pattern}.tsv"
    params:
        simulation=paramspace.instance

Scatter-gather

scattergather:
    someprocess=8


rule scatter:
    output:
        scatter.someprocess("scattered/{scatteritem}.txt")
    ...


rule step2:
    input:
        "scattered/{scatteritem}.txt"
    output:
        "transformed/{scatteritem}.txt"
    ...


rule gather:
    input:
        gather.someprocess("transformed/{scatteritem}.txt")
    ...

Split-Apply-Combine (data-dependent)

checkpoint clustering:
    input:
        "samples/{sample}.txt"
    output:
        clusters=directory("clustering/{sample}")
    shell:
        "cluster {input} {output}"


# an intermediate rule
rule intermediate:
    input:
        "clustering/{sample}/{i}.txt"
    output:
        "post/{sample}/{i}.txt"
    shell:
        "do-something {input} {output}"


def aggregate_input(wildcards):
    checkpoint_output = checkpoints.clustering.get(**wildcards).output[0]
    return expand("post/{sample}/{i}.txt",
           sample=wildcards.sample,
           i=glob_wildcards(os.path.join(checkpoint_output, "{i}.txt")).i)


# an aggregation over all produced clusters
rule aggregate:
    input:
        aggregate_input
    output:
        "plots/{sample}.pdf"
    script:
        "scripts/plot-clusters.py"

Data-dependent branch selection

checkpoint somestep:
    input:
        "samples/{sample}.txt"
    output:
        "somestep/{sample}.txt"
    shell:
        "echo {wildcards.sample} > somestep/{wildcards.sample}.txt"


def aggregate_input(wildcards):
    with checkpoints.somestep.get(sample=wildcards.sample).output[0].open() as f:
        if f.read().strip() == "a":
            return "branch_a/{sample}.txt"
        else:
            return "branch_b/{sample}.txt"


rule aggregate:
    input:
        aggregate_input
    output:
        "aggregated/{sample}.txt"
    shell:
        "touch {output}"

Data-dependent branch selection

# intermediate rule
rule intermediate:
    input:
        "somestep/{sample}.txt"
    output:
        "branch_a/{sample}.txt"
    shell:
        "touch {output}"


# alternative intermediate rule
rule alt_intermediate:
    input:
        "somestep/{sample}.txt"
    output:
        "branch_b/{sample}.txt"
    shell:
        "touch {output}"

QC-based sample filtering

def get_results(wildcards):
    qc = pd.read_csv(checkpoints.qc.get().output[0].open(), sep="\t")
    return expand(
        "results/processed/{sample}.txt", 
        sample=qc[qc["mapping-rate"] > 90.0]["sample"]
    )


rule all:
    input:
        get_results


checkpoint qc:
    input:
        expand("results/preprocessed/{sample}.txt", sample=samples)
    output:
        "results/qc.tsv"
    shell:
        "perfom-qc {input} > {output}"


rule process:
    input:
        "results/preprocessed/{sample}.txt"
    output:
      	"results/processed/{sample.txt}"
    shell:
      	"process {input} > {output}"

Snakemake design patterns

By Johannes Köster

Loading comments...

More from Johannes Köster