Johannes Köster
University of Duisburg-Essen
2025
Reproducibility
Transparency
Adaptability
>1.6 million downloads since 2015
>3000 citations
>14 citations per week in 2024
dataset
results
dataset
dataset
dataset
dataset
dataset
rule mytask:
input:
"path/to/{dataset}.txt"
output:
"result/{dataset}.txt"
script:
"scripts/myscript.R"
rule myfiltration:
input:
"result/{dataset}.txt"
output:
"result/{dataset}.filtered.txt"
shell:
"mycommand {input} > {output}"
rule aggregate:
input:
"results/dataset1.filtered.txt",
"results/dataset2.filtered.txt"
output:
"plots/myplot.pdf"
script:
"scripts/myplot.R"
rule mytask:
input:
"data/{sample}.txt"
output:
"result/{sample}.txt"
shell:
"some-tool {input} > {output}"
rule name
how to create output from input
define
rule mytask:
input:
"path/to/{dataset}.txt"
output:
"result/{dataset}.txt"
script:
"scripts/myscript.R"
rule myfiltration:
input:
"result/{dataset}.txt"
output:
"result/{dataset}.filtered.txt"
shell:
"mycommand {input} > {output}"
rule aggregate:
input:
"results/dataset1.filtered.txt",
"results/dataset2.filtered.txt"
output:
"plots/myplot.pdf"
script:
"scripts/myplot.R"
rule mytask:
input:
"data/{sample}.txt"
output:
"result/{sample}.txt"
script:
"scripts/myscript.py"
reusable scripts:
import pandas as pd
data = pd.read_table(snakemake.input[0])
data = data.sort_values("id")
data.to_csv(snakemake.output[0], sep="\t")
Python:
data <- read.table(snakemake@input[[1]])
data <- data[order(data$id),]
write.table(data, file = snakemake@output[[1]])
R:
import polars as pl
pl.read_csv(&snakemake.input[0])
.sort()
.to_csv(&snakemake.output[0])
Rust:
$XONSH_TRACEBACK_LOGFILE = snakemake.log[0]
xsv sort -s id @(snakemake.input[0]) > @(snakemake.output[0])
Xonsh:
(import pandas :as pd)
(setv data (pd.read_table (get snakemake.input 0)))
(setv data (.sort_values data "id"))
(.to_csv data (get snakemake.output 0) :sep "\t")
Hy:
#!/usr/bin/env bash
exec 2> "${snakemake_log[0]}"
xsv sort -s id ${snakemake_input[0]} > ${snakemake_output[0]}
Bash:
new!
new!
rule map_reads:
input:
"{sample}.bam"
output:
"{sample}.sorted.bam"
wrapper:
"0.22.0/bio/samtools/sort"
reusable wrappers from central repository
job selection
job resource usage
free resources
job temp file consumption
temp file lifetime fraction
job priority
job thread usage
temp file size
temp file deletion
--groups a=g1 b=g1
--groups a=g1 b=g1 --group-components g1=2
--groups a=g1 b=g1 --group-components g1=5
snakemake --default-resources --jobs 100
mem_mb="min(max(2*input.size_mb, 1000), 8000)"
disk_mb="max(2*input.size_mb, 1000) if input else 50000"
snakemake --default-resources mem_mb=... --jobs 100
Specify defaults:
Use builtin defaults:
Store in profile:
# /etc/xdg/snakemake/default/config.yaml:
default-resources:
mem_mb: "min(max(2*input.size_mb, 1000), 8000)"
...
Define per rule:
# profiles/default/config.yaml:
set-resources:
mytask:
mem_mb: 16000
...
workstation
compute server
cluster
grid computing
cloud computing
rule mytask:
input:
"path/to/{dataset}.txt"
output:
"result/{dataset}.txt"
conda:
"envs/some-tool.yaml"
shell:
"some-tool {input} > {output}"
channels:
- conda-forge
dependencies:
- some-tool =2.3.1
- some-lib =1.1.2
rule mytask:
input:
"path/to/{dataset}.txt"
output:
"result/{dataset}.txt"
container:
"docker://biocontainers/some-tool:2.3.1"
shell:
"some-tool {input} > {output}"
rule chipseq_pipeline:
input:
input="design.csv",
fasta="data/genome.fasta",
gtf="data/genome.gtf",
# any --<argname> pipeline file arguments can be given here as <argname>=<path>
output:
report="results/multiqc/broadPeak/multiqc_report.html",
params:
pipeline="nf-core/chipseq",
revision="2.0.0",
profile=["test", "docker"],
outdir=subpath(output.report, ancestor=2),
# any --<argname> pipeline arguments can be given here as <argname>=<value>
handover: True
wrapper:
"v7.2.0/utils/nextflow"
rule mytask:
input:
"path/to/{dataset}.txt"
output:
"result/{dataset}.txt"
params:
some_threshold=lookup(
dpath="some_tool/thresholds/{dataset}",
within=config,
default=0.1
)
shell:
"some-tool {input} > {output}"
new!
rule mytask:
input:
"path/to/{dataset}.txt"
output:
"result/{dataset}.txt"
params:
some_threshold=lookup(
query="dataset == '{dataset}'",
cols="threshold",
within=sheet
)
shell:
"some-tool {input} > {output}"
new!
rule mytask:
input:
branch(
lookup(dpath="prefilter/activate", within=config),
then="results/preprocessed/{dataset}",
otherwise="path/to/{dataset}"
)
output:
"result/{dataset}.txt"
shell:
"some-tool {input} > {output}"
new!
rule mytask:
input:
branch(
lookup(dpath="prefilter/activate", within=config),
then="results/preprocessed/{dataset}",
otherwise="path/to/{dataset}"
)
output:
foo="result/{dataset}.txt"
params:
outdir=subpath(output.foo, parent=True)
shell:
"some-tool {input} -o {params.outdir}"
new!
rule mytask:
input:
branch(
lookup(dpath="prefilter/activate", within=config),
then="results/preprocessed/{dataset}",
otherwise="path/to/{dataset}"
)
output:
foo="result/{dataset}.txt"
params:
prefix=subpath(output.foo, strip_suffix=".txt")
shell:
"some-tool {input} -o {params.prefix}"
new!
rule mytask:
input:
"<resources>/{dataset}.txt"
output:
"<results>/{dataset}.txt"
shell:
"some-tool {input} > {output}"
new!
# config.yaml
pathvars:
results: some/custom/path/results
resources: some/custom/path/resources
module star_arriba:
meta_wrapper: "v9.0.1/meta/bio/star_arriba"
pathvars:
results="...", # Path to results directory
resources="...", # Path to resources directory
logs="...", # Path to logs directory
genome_sequence="...", # Path to FASTA file with genome sequence
genome_annotation="...", # Path to GTF file with genome annotation
reads_r1="...", # Path/pattern for FASTQ files with R1 reads
reads_r2="...", # Path/pattern for FASTQ files with R2 reads
per="...", # Pattern for sample identifiers, e.g. ``"{sample}"``
use rule * from star_arriba as star_arriba_*
new!
inputflags:
access.sequential
rule mytask:
input:
access.random("<resources>/{dataset}.txt")
output:
"<results>/{dataset}.txt"
shell:
"some-tool {input} > {output}"
new!
from dataclasses import dataclass, field
from snakemake_interface_common.exceptions import WorkflowError
from snakemake_interface_report_plugins.reporter import ReporterBase
from snakemake_interface_report_plugins.settings import ReportSettingsBase
# Optional:
# Define additional settings for your reporter.
# They will occur in the Snakemake CLI as --report-<reporter-name>-<param-name>
# Omit this class if you don't need any.
# Make sure that all defined fields are Optional (or bool) and specify a default value
# of None (or False) or anything else that makes sense in your case.
@dataclass
class ReportSettings(ReportSettingsBase):
myparam: Optional[int] = field(
default=None,
metadata={
"help": "Some help text",
# Optionally request that setting is also available for specification
# via an environment variable. The variable will be named automatically as
# SNAKEMAKE_REPORT_<reporter-name>_<param-name>, all upper case.
# This mechanism should ONLY be used for passwords and usernames.
# For other items, we rather recommend to let people use a profile
# for setting defaults
# (https://snakemake.readthedocs.io/en/stable/executing/cli.html#profiles).
"env_var": False,
# Optionally specify a function that parses the value given by the user.
# This is useful to create complex types from the user input.
"parse_func": ...,
# If a parse_func is specified, you also have to specify an unparse_func
# that converts the parsed value back to a string.
"unparse_func": ...,
# Optionally specify that setting is required when the reporter is in use.
"required": True,
# Optionally specify multiple args with "nargs": True
},
)
# Required:
# Implementation of your reporter
class Reporter(ReporterBase):
def __post_init__(self):
# initialize additional attributes
# Do not overwrite the __init__ method as this is kept in control of the base
# class in order to simplify the update process.
# In particular, the settings of above ReportSettings class are accessible via
# self.settings.
def render(self):
# Render the report, using attributes of the base class.
...
Snakemake covers all aspects of fully reproducible, transparent, and adaptable data analysis, offering
https://snakemake.github.io