21st April 2026

FLOW MATCHING FOR GENERATIVE MODELING

https://openreview.net/pdf?id=PqvMRDCJT9t

Autonomous Driving

Audio

Image/Video

Protein Structure

Generation

https://arxiv.org/pdf/2510.11083
https://flowception-meta.github.io
https://voicebox.metademolab.com
https://arxiv.org/pdf/2310.02391

Introduction

https://arxiv.org/pdf/2412.06264

Introduction

https://arxiv.org/pdf/2412.06264

Introduction

https://arxiv.org/pdf/2412.06264

Preliminaries

\begin{aligned} &v_t: \textbf{Vector Field}\\ &\phi_t: \textbf{Diffeomorphic map}\\ \\ &v_t \text{ can be used to construct a time-dependent diffeomorphic map,} \\ &\phi_t \text{ using ODE:} \\ \\ &\frac{d}{dt}\phi_t(x) = v_t(\phi_t(x))\\ &\phi_0(x) = x \\ \\ &\textbf{push-forward equation}\\ &p_t = [\phi_t]_*p_0\\ &[\phi_t]_*p_0(x) = p_0(\phi_t^{-1}(x)) \det\left(\frac{\partial \phi_t^{-1}}{\partial x}(x)\right) \end{aligned}

Preliminaries

\begin{aligned} &\textbf{Continuity Equation}\\ &\text{If } v_t \text{ generates } p_t \text{, then } p_t \text{ and } v_t \text{ satisfy:}\\ &\boxed{\frac{\partial}{\partial t} p_t(x) + \nabla \cdot \big( v_t(x) \, p_t(x) \big) = 0}\\ &\text{when it satisfies: } p_t = [\phi_t]_* \, p_0 \end{aligned}

Let's Go...

Data

p_0 = p ; \text{ Simple Distribution} \\ p_1 = q ; \text{ Data Distribution} \\

https://arxiv.org/pdf/2412.06264

p can be any simple distribution, e.g.:

Path Design

\begin{aligned} &p_0 = p ; \text{ Simple Distribution} \\ &p_1 = q ; \text{ Data Distribution} \\ \end{aligned}

p_t \text{ probability path s.t. } 0 \le t \le 1

p = \mathcal{N}(x|0, I)

https://arxiv.org/pdf/2412.06264

Training

\begin{aligned} &\boxed{\mathcal{L}_{\text{FM}}(\theta) = \mathbb{E}_{t, p_t(x)} \|v_t(x) - u_t(x)\|^2}\\ \\ &\text{where,} \\ &\theta \text{ learnable param of the CNF vector field } v_t \\ &t \sim \mathcal{U}[0,1] \\ &x \sim p_t(x) \end{aligned}

https://arxiv.org/pdf/2412.06264

Training

\begin{aligned} &\textbf{\color{red}FM Loss is Intractable} \\ &\text{Target vector field: } u_t(x) = \int u_t(x \mid x_1) \frac{p_t(x \mid x_1)\, q(x_1)}{p_t(x)}\, dx_1\\ &\text{Reasons:}\\ &1. q(x_1): \text{true data distribution is \textbf{unknown}}\\ &2. p_t(x) = \int p_t(x \mid x_1) q(x_1) dx_1 \quad \textbf{marginal is intractable}\\ \end{aligned}

https://arxiv.org/pdf/2412.06264

Training

\begin{aligned} &\textbf{FM Loss is Intractable} \\ &\text{Target vector field: } u_t(x) = \int u_t(x \mid x_1) \frac{p_t(x \mid x_1)\, q(x_1)}{p_t(x)}\, dx_1\\ &\text{Reasons:}\\ &1. q(x_1): \text{true data distribution is \textbf{unknown}}\\ &2. p_t(x) = \int p_t(x \mid x_1) q(x_1) dx_1 \quad \textbf{marginal is intractable}\\ \end{aligned}

\text{\color{green}Solution: Conditional Flow Matching}

https://arxiv.org/pdf/2412.06264

Idea

Mix all the easy conditional paths together to get the hard marginal path for free.

Constructing p_t, u_t

\begin{aligned} &x_1 \sim q(x); \quad \text{sample from data distribution} \\ &p_0(x|x_1) = p(x) \\ &p_1(x|x_1) = \mathcal{N}(x|x_1, \sigma^{2} I) \quad \text{small } \sigma > 0\\ \end{aligned}

Constructing p_t, u_t

\begin{aligned} &x_1 \sim q(x); \quad \text{sample from data distribution} \\ &p_0(x|x_1) = p(x) \\ &p_1(x|x_1) = \mathcal{N}(x|x_1, \sigma^{2} I) \quad \text{small } \sigma > 0\\ \end{aligned}

\begin{aligned} &\textbf{From Marginalization:} \\ &p_t(x) = \int p_t(x|x_1) q(x_1) dx_1 \\ &p_1(x) = \int p_1(x|x_1) q(x_1) dx_1 \approx q(x)\\ \end{aligned}

Constructing p_t, u_t

\begin{aligned} &x_1 \sim q(x); \quad \text{sample from data distribution} \\ &p_0(x|x_1) = p(x) \\ &p_1(x|x_1) = \mathcal{N}(x|x_1, \sigma^{2} I) \quad \text{small } \sigma > 0\\ \end{aligned}

\begin{aligned} &\textbf{From Marginalization:} \\ &\boxed{p_t(x) = \int p_t(x|x_1) q(x_1) dx_1}\\ &p_1(x) = \int p_1(x|x_1) q(x_1) dx_1 \approx q(x)\\ \end{aligned}

\begin{aligned} &\textbf{On solving the Continuity Equation:}\\ &\frac{\partial p_t(x)}{\partial t} + \nabla \cdot \left( p_t(x) \, u_t(x) \right) = 0 \text{ we get,} \\ &\boxed{u_t(x) = \int u_t(x|x_1) \frac{p_t(x|x_1) q(x_1)}{p_t(x)} \, dx_1} \quad \\ \end{aligned}

Theorem 1

Conditional Flow Matching

\boxed{\mathcal{L}_{\text{CFM}}(\theta) = \mathbb{E}_{t, q(x_1), p_t(x|x_1)} \left\| v_t(x) - u_t(x|x_1) \right\|^2}

\begin{aligned} &\text{where,} \\ &t \sim \mathcal{U}[0, 1] \\ &x_1 \sim q(x_1) \\ &x \sim p_t(x|x_1) \end{aligned}

Conditional Flow Matching

\boxed{\mathcal{L}_{\text{CFM}}(\theta) = \mathbb{E}_{t, q(x_1), p_t(x|x_1)} \left\| v_t(x) - u_t(x|x_1) \right\|^2}

\boxed{\nabla_\theta \mathcal{L}_{\text{FM}}(\theta) = \nabla_\theta \mathcal{L}_{\text{CFM}}(\theta)}

Theorem 2

\begin{aligned} &\text{where,} \\ &t \sim \mathcal{U}[0, 1] \\ &x_1 \sim q(x_1) \\ &x \sim p_t(x|x_1) \end{aligned}

Conditional Probability Path & Vector Fields

\begin{aligned} &p_t(x|x_1) = \mathcal{N}(x \mid \mu_t(x_1), \sigma_t(x_1)^2 I) \\ &\text{where,} \\ &\mu_0{(x_1)} = 0, \quad \sigma_0{(x_1)} = 1 \rightarrow p_0(x|x_1) = \mathcal{N}{(0, I)} \\ &\mu_1{(x_1)} = x_1, \quad \sigma_1{(x_1)} = \sigma_{\min} \approx 0 \rightarrow p_1(x|x_1) = \mathcal{N}(x_1, \sigma_{\min}^2 I)\\ \\ &\text{Sample from } p_t(x|x_1) \text{ can be written as:}\\ &\psi_t(x_0) = \sigma_t(x_1) \, x_0 + \mu_t(x_1); \text{\color{blue}Flow Map}\\ &\text{Flow moves particles from their starting position to where they are at time } t \\ &\text{where } x_0 \sim \mathcal{N}(0, I) \end{aligned}

\psi_t \text{ pushes the noise distribution } p_0(x|x_1) = p(x) \text{ to } p_t(x|x_1) \\ [\psi_t]_*p(x) = p_t(x|x_1)

Conditional Flow Matching works with any conditional probability path

Conditional Probability Path & Vector Fields

Conditional Flow Matching works with any conditional probability path

Conditional Probability Path & Vector Fields

Conditional Flow Matching works with any conditional probability path

\begin{aligned} &\text{Sample from } p_t(x|x_1) \text{ can be written as:}\\ &\boxed{\psi_t(x_0) = \sigma_t(x_1) \, x_0 + \mu_t(x_1)}; \text{\color{blue}Flow Map}\\\\ \end{aligned}

Conditional Probability Path & Vector Fields

\begin{aligned} &\frac{d}{dt}\psi_t(x) = u_t(\psi_t(x)|x_1)\\ \\ &\text{On substituting values in } \mathcal{L}_{\text{CFM}} \\ &\boxed{\mathcal{L}_{\text{CFM}}(\theta) = \mathbb{E}_{t, q(x_1), p(x_0)} \left\| v_t(\psi_t(x_0)) - \frac{d}{dt}\psi_t(x_0) \right\|^2} \end{aligned}

Conditional Probability Path & Vector Fields

Theorem 3

\begin{aligned} &\text{On differentiating } \psi_t(x_0) = \sigma_t(x_1) \, x_0 + \mu_t(x_1) \\ &\text{ and substituting the value } \frac{d}{dt}\psi_t(x) = u_t(\psi_t(x)|x_1) \\ &\boxed{u_t(x|x_1) = \frac{\sigma_t'(x_1)}{\sigma_t(x_1)}(x - \mu_t(x_1)) + \mu_t'(x_1)} \\ \end{aligned}

Special Instances

\begin{aligned} &\textbf{Variance Exploding Path:}\\ &p_t(x|x_1) = \mathcal{N}(x \mid x_1, \sigma_{1-t}^2 I) \\ &u_t(x|x_1) = -\frac{\sigma_{1-t}'}{\sigma_{1-t}}(x - x_1) \\ \\ &\textbf{Variance Preserving Path:}\\ &p_t(x|x_1) = \mathcal{N}\left(x \mid \alpha_{1-t} x_1, \left(1 - \alpha_{1-t}^2\right) I\right), \\ &\text{where } \alpha_t = e^{-\frac{1}{2}T(t)}, \quad T(t) = \int_0^t \beta(s) \, ds \\ &u_t(x|x_1) = \frac{\alpha_{1-t}'}{1 - \alpha_{1-t}^2}(\alpha_{1-t} x - x_1) \\ &= -\frac{T'(1-t)}{2}\left[\frac{e^{-T(1-t)}x - e^{-\frac{1}{2}T(1-t)}x_1}{1 - e^{-T(1-t)}}\right] \\ \end{aligned}

\begin{aligned} &\textbf{Optimal Transport Path:}\\ &\mu_t(x_1) = t \, x_1, \quad \sigma_t(x_1) = 1 - (1 - \sigma_{\min})t \\ \\ &\text{Conditional probability path:} \\ &p_t(x|x_1) = \mathcal{N}\left(x \mid t \, x_1, \, \left(1 - (1-\sigma_{\min})t\right)^2 I\right) \\ \\ &\text{Conditional vector field (from Theorem 3):} \\ &u_t(x|x_1) = \frac{x_1 - (1 - \sigma_{\min})x}{1 - (1 - \sigma_{\min})t} \\ \end{aligned}

Diffusion Conditional

Optimal Transport

Comparison

Comparison

Optimal Transport

Diffusion Conditional

Results

DEMO

Train

# Conditional Flow Matching training loop (slide pseudo-code).
# sigma_t, mu_t, d_sigma_t and d_mu_t are placeholders set by the chosen
# conditional path p_t(x|x1); presumably d_* are the time derivatives of
# sigma_t / mu_t evaluated at the sampled t -- confirm against the path used.
for x1 in dataloader:
    x0 = torch.randn_like(x1)                          # sample noise, same shape/device as x1
    t = torch.rand(x1.shape[0], 1, device=x1.device)   # sample time in [0, 1), one per example

    xt = sigma_t * x0 + mu_t * x1           # depends on p_t choice
    ut = d_mu_t * x1 + d_sigma_t * x0       # corresponding target velocity

    loss = ((model(t, xt) - ut) ** 2).mean() # MSE loss
    optimizer.zero_grad()                    # clear gradients left over from the previous step
    loss.backward()
    optimizer.step()

Inference

@torch.no_grad()
def generate(model, shape, steps=100):
    """Integrate the learned velocity field from noise over `steps` uniform steps.

    Args:
        model: callable invoked as ``model(t, x)``; its output is treated as
            the velocity at time ``t`` and state ``x``.
        shape: shape of the initial noise tensor; ``shape[0]`` sets the number
            of rows in the time tensor passed to the model.
        steps: number of integration steps; the step size is ``1.0 / steps``.

    Returns:
        The state tensor after the last step (intended as a sample x1 ~ q).
    """
    step_size = 1.0 / steps
    state = torch.randn(shape)               # start from x0 ~ N(0, I)

    for k in range(steps):
        # Time is broadcast as a (shape[0], 1) column holding the current t.
        t_now = torch.full((shape[0], 1), k * step_size)
        velocity = model(t_now, state)
        # ODEStep is not defined in this snippet; it stands for the chosen solver's update.
        state = ODEStep(state, velocity, t_now, step_size)

    return state

Quick References

https://arxiv.org/pdf/2412.06264

End

Prerequisites

Probability

\begin{aligned} &P(X,Y) \quad \text{Joint Probability} \\ &P(X|Y) = \frac{P(X,Y)}{P(Y)} \quad \text{Conditional Probability} \\ &P(X) = \sum_Y P(X,Y) \quad \text{Marginal Probability} \\ &P(X) = \sum_y P(X|Y=y)\, P(Y=y) \\ &P(X|Y) = \frac{P(Y|X)\, P(X)}{P(Y)} \quad \text{Bayes' Theorem} \\ &P(X|Y) = P(X) \quad \text{Independence} \end{aligned}

\begin{aligned} &P(X|Y=y) = P(X, Y=y) / P(Y=y) \\ &P(X, Y) = P(X|Y) P(Y) \\ &P(X) = \int P(X, Y) dY \\ &P(X) = \int P(X|Y=y)P(Y=y) dy \\ &P(X) = E_Y[P(X|Y)] \\ &p(x) = E_Y[p(x|Y)] \\ &E[X] = \int x p(x) dx = \int x E_Y[p(x|Y)] dx = E_Y[\int x p(x|Y) dx] = E_Y[E[X|Y]]\\ \end{aligned}

Derivation

\begin{aligned} &\frac{\partial p_t(x|x_1)}{\partial t} + \nabla \cdot \left( p_t(x|x_1) \, u_t(x|x_1) \right) = 0 \\ &\frac{\partial p_t(x)}{\partial t} + \nabla \cdot \left( p_t(x) \, u_t(x) \right) = 0 \qquad \text{(B)} \\ &p_t(x) = \int p_t(x|x_1) \, q(x_1) \, dx_1 \\ &\frac{\partial p_t(x)}{\partial t} = \int \frac{\partial p_t(x|x_1)}{\partial t} \, q(x_1) \, dx_1 \\ &= -\int \nabla \cdot \left( p_t(x|x_1) \, u_t(x|x_1) \right) q(x_1) \, dx_1 \\ &= -\nabla \cdot \left( \int u_t(x|x_1) \, p_t(x|x_1) \, q(x_1) \, dx_1 \right) \\ &p_t(x) \, u_t(x) = \int u_t(x|x_1) \, p_t(x|x_1) \, q(x_1) \, dx_1 \\ &\boxed{u_t(x) = \int u_t(x|x_1) \, \frac{p_t(x|x_1) \, q(x_1)}{p_t(x)} \, dx_1} \\ &\frac{p_t(x|x_1) \, q(x_1)}{p_t(x)} = p_t(x_1|x) \\ &u_t(x) = \int u_t(x|x_1) \, p_t(x_1|x) \, dx_1 \\ &u_t(x) = \mathbb{E}_{x_1 \sim p_t(x_1|x)} \left[ u_t(x|x_1) \right] \\ \end{aligned}

All equations

\frac{d}{dt}\phi_t(x) = v_t(\phi_t(x))

\phi_0(x) = x

p_t = [\phi_t]_*p_0

[\phi_t]_*p_0(x) = p_0(\phi_t^{-1}(x)) \det\left(\frac{\partial \phi_t^{-1}}{\partial x}(x)\right)

\mathcal{L}_{\text{FM}}(\theta) = \mathbb{E}_{t, p_t(x)} \|v_t(x) - u_t(x)\|^2

All equations

p_t(x) = \int p_t(x|x_1) q(x_1) \, dx_1

p_1(x) = \int p_1(x|x_1) q(x_1) \, dx_1 \approx q(x)

u_t(x) = \int u_t(x|x_1) \frac{p_t(x|x_1) q(x_1)}{p_t(x)} \, dx_1

\mathcal{L}_{\text{CFM}}(\theta) = \mathbb{E}_{t, q(x_1), p_t(x|x_1)} \|v_t(x) - u_t(x|x_1)\|^2

p_t(x|x_1) = \mathcal{N}(x \mid \mu_t(x_1), \sigma_t(x_1)^2 I)

All equations

p_t(x|x_1) = \mathcal{N}(x \mid \mu_t(x_1), \sigma_t(x_1)^2 I)

\psi_t(x) = \sigma_t(x_1) x + \mu_t(x_1)

[\psi_t]_*p(x) = p_t(x|x_1)

\frac{d}{dt}\psi_t(x) = u_t(\psi_t(x)|x_1)

\mathcal{L}_{\text{CFM}}(\theta) = \mathbb{E}_{t, q(x_1), p(x_0)} \left\| v_t(\psi_t(x_0)) - \frac{d}{dt}\psi_t(x_0) \right\|^2

All equations

u_t(x|x_1) = \frac{\sigma_t'(x_1)}{\sigma_t(x_1)}(x - \mu_t(x_1)) + \mu_t'(x_1)

p_t(x|x_1) = \mathcal{N}(x \mid x_1, \sigma_{1-t}^2 I)

u_t(x|x_1) = -\frac{\sigma_{1-t}'}{\sigma_{1-t}}(x - x_1)

p_t(x|x_1) = \mathcal{N}\left(x \mid \alpha_{1-t} x_1, \left(1 - \alpha_{1-t}^2\right) I\right),\text{ where } \alpha_t = e^{-\frac{1}{2}T(t)}, \quad T(t) = \int_0^t \beta(s) \, ds

u_t(x|x_1) = \frac{\alpha_{1-t}'}{1 - \alpha_{1-t}^2}(\alpha_{1-t} x - x_1) = -\frac{T'(1-t)}{2}\left[\frac{e^{-T(1-t)}x - e^{-\frac{1}{2}T(1-t)}x_1}{1 - e^{-T(1-t)}}\right]

All equations

\mu_t(x_1) = t x_1, \quad \sigma_t(x_1) = 1 - (1 - \sigma_{\min})t

u_t(x|x_1) = \frac{x_1 - (1 - \sigma_{\min})x}{1 - (1 - \sigma_{\min})t}

\psi_t(x) = (1 - (1 - \sigma_{\min})t)x + t x_1

\mathcal{L}_{\text{CFM}}(\theta) = \mathbb{E}_{t, q(x_1), p(x_0)} \left\| v_t(\psi_t(x_0)) - \left(x_1 - (1 - \sigma_{\min})x_0\right) \right\|^2

p_t = [(1-t)\text{id} + t\psi]_*p_0

\frac{d}{dt}p_t(x) + \operatorname{div}(p_t(x) v_t(x)) = 0