DDPM equation derivation

This derivation partly follows [1].

As the DDPM paper describes, the forward (diffusion) process can be stated as follows

q(\mathbf{x}_t|\mathbf{x}_{t-1}) := \mathcal{N}(\mathbf{x}_t; \sqrt{1-\beta_t}\mathbf{x}_{t-1}, \beta_t I)

denoting

\alpha_t := 1 - \beta_t
\tilde{\alpha}_t := \prod_{s=1}^t{\alpha_s}

using the reparameterization trick, with \boldsymbol{\epsilon} \sim \mathcal{N}(\mathbf{0}, I) (at each step the sum of independent Gaussian noises is merged into a single Gaussian)

\begin{aligned} \mathbf{x}_t & = \sqrt{1-\beta_t} \mathbf{x}_{t-1} + \sqrt{\beta_t} \boldsymbol{\epsilon} \\ & = \sqrt{\alpha_t} \mathbf{x}_{t-1} + \sqrt{1-\alpha_t} \boldsymbol{\epsilon} \\ & = \sqrt{\alpha_t \alpha_{t-1}} \mathbf{x}_{t-2} + \sqrt{\alpha_t (1-\alpha_{t-1}) + (1 - \alpha_t)} \boldsymbol{\epsilon} \\ & = \sqrt{\alpha_t \alpha_{t-1}} \mathbf{x}_{t-2} + \sqrt{1-\alpha_t \alpha_{t-1}} \boldsymbol{\epsilon} \\ & = \sqrt{\alpha_t \alpha_{t-1} \alpha_{t-2}} \mathbf{x}_{t-3} + \sqrt{1-\alpha_t \alpha_{t-1} \alpha_{t-2}} \boldsymbol{\epsilon} \\ & = \dots \\ & = \sqrt{\alpha_t \alpha_{t-1} \dots \alpha_1} \mathbf{x}_0 + \sqrt{1-\alpha_t \alpha_{t-1} \dots \alpha_1 } \boldsymbol{\epsilon} \\ & = \sqrt{\prod_{s=1}^t{\alpha_s}} \mathbf{x}_0 + \sqrt{1-\prod_{s=1}^t{\alpha_s} } \boldsymbol{\epsilon} \\ & = \sqrt{\tilde{\alpha}_t} \mathbf{x}_0 + \sqrt{1-\tilde{\alpha}_t } \boldsymbol{\epsilon} \\ \end{aligned}

The reverse (denoising) process can be stated as follows

p_\theta(\mathbf{x}_{t-1} | \mathbf{x}_t) := \mathcal{N}(\mathbf{x}_{t-1}; \boldsymbol{\mu}_\theta(\mathbf{x}_t, t), \boldsymbol{\Sigma}_\theta(\mathbf{x}_t, t))

where we fix the variance \Sigma_\theta(x_t, t) to a certain schedule

using the variational lower bound, the negative log-likelihood can be bounded as

\begin{aligned} -\log{p_\theta(\mathbf{x}_0)} & \leq -\log{p_\theta(\mathbf{x}_0)} + D_{KL}(q(\mathbf{x}_{1:T} | \mathbf{x}_0) \,||\, p_\theta (\mathbf{x}_{1:T} | \mathbf{x}_0)) \\ & = -\log{p_\theta(\mathbf{x}_0)} + \mathbb{E}_q\Big[\log{{\frac{q(\mathbf{x}_{1:T} | \mathbf{x}_0)}{p_\theta (\mathbf{x}_{1:T} | \mathbf{x}_0)}}}\Big] \\ & = -\log{p_\theta(\mathbf{x}_0)} + \mathbb{E}_q\Big[\log{{\frac{q(\mathbf{x}_{1:T} | \mathbf{x}_0)}{\frac{p_\theta(\mathbf{x}_0 | \mathbf{x}_{1:T}) p_\theta(\mathbf{x}_{1:T})}{p_\theta(\mathbf{x}_0)}}}}\Big] \\ & = -\log{p_\theta(\mathbf{x}_0)} + \mathbb{E}_q\Big[\log{{\frac{q(\mathbf{x}_{1:T} | \mathbf{x}_0)}{\frac{p_\theta(\mathbf{x}_0, \mathbf{x}_{1:T})}{p_\theta(\mathbf{x}_0)}}}}\Big] \\ & = -\log{p_\theta(\mathbf{x}_0)} + \mathbb{E}_q\Big[\log{{\frac{q(\mathbf{x}_{1:T} | \mathbf{x}_0)}{\frac{p_\theta(\mathbf{x}_{0:T})}{p_\theta(\mathbf{x}_0)}}}}\Big] \\ & = -\log{p_\theta(\mathbf{x}_0)} + \mathbb{E}_q\Big[\log{{\frac{q(\mathbf{x}_{1:T} | \mathbf{x}_0)}{{p_\theta(\mathbf{x}_{0:T})}}}}\Big] + \log{{p_\theta(\mathbf{x}_0)}} \\ & = \mathbb{E}_q\Big[\log{{\frac{q(\mathbf{x}_{1:T} | \mathbf{x}_0)}{{p_\theta(\mathbf{x}_{0:T})}}}}\Big] \\ \end{aligned}

though simplified, this bound cannot be computed directly yet; rewriting it term by term gives

\begin{aligned} -\log{p_\theta(\mathbf{x}_0)} & \leq \mathbb{E}_q\Big[\log{\frac{\prod_{t=1}^T{q(\mathbf{x}_t | \mathbf{x}_{t-1})}}{p(\mathbf{x}_T) \prod_{t=1}^T{p_\theta(\mathbf{x}_{t-1} | \mathbf{x}_t)}}}\Big] \\ & = \mathbb{E}_q\Big[-\log{p(\mathbf{x}_T)} + \sum_{t=1}^T{\log{\frac{q(\mathbf{x}_t | \mathbf{x}_{t-1})}{p_\theta(\mathbf{x}_{t-1} | \mathbf{x}_t)}}}\Big] \\ & = \mathbb{E}_q\Big[-\log{p(\mathbf{x}_T)} + \sum_{t=2}^T{\log{\frac{q(\mathbf{x}_t | \mathbf{x}_{t-1})}{p_\theta(\mathbf{x}_{t-1} | \mathbf{x}_t)}}} + \log{\frac{q(\mathbf{x}_1 | \mathbf{x}_0)}{p_\theta(\mathbf{x}_0 | \mathbf{x}_1)}}\Big] \\ & = \mathbb{E}_q\Big[-\log{p(\mathbf{x}_T)} + \sum_{t=2}^T{\log{\frac{q(\mathbf{x}_t | \mathbf{x}_{t-1}, \mathbf{x}_0)}{p_\theta(\mathbf{x}_{t-1} | \mathbf{x}_t)}}} + \log{\frac{q(\mathbf{x}_1 | \mathbf{x}_0)}{p_\theta(\mathbf{x}_0 | \mathbf{x}_1)}}\Big] \quad \text{(Markov property of the forward process)} \\ & = \mathbb{E}_q\Big[-\log{p(\mathbf{x}_T)} + \sum_{t=2}^T{\log{\frac{q(\mathbf{x}_{t-1} | \mathbf{x}_t, \mathbf{x}_0)\, q(\mathbf{x}_t | \mathbf{x}_0)}{p_\theta(\mathbf{x}_{t-1} | \mathbf{x}_t)\, q(\mathbf{x}_{t-1} | \mathbf{x}_0)}}} + \log{\frac{q(\mathbf{x}_1 | \mathbf{x}_0)}{p_\theta(\mathbf{x}_0 | \mathbf{x}_1)}}\Big] \quad \text{(Bayes' rule)} \\ & = \mathbb{E}_q\Big[-\log{p(\mathbf{x}_T)} + \sum_{t=2}^T{\log{\frac{q(\mathbf{x}_{t-1} | \mathbf{x}_t, \mathbf{x}_0)}{p_\theta(\mathbf{x}_{t-1} | \mathbf{x}_t)}}} + \sum_{t=2}^T{\log{\frac{q(\mathbf{x}_t | \mathbf{x}_0)}{q(\mathbf{x}_{t-1} | \mathbf{x}_0)}}} + \log{\frac{q(\mathbf{x}_1 | \mathbf{x}_0)}{p_\theta(\mathbf{x}_0 | \mathbf{x}_1)}}\Big] \\ & = \mathbb{E}_q\Big[-\log{p(\mathbf{x}_T)} + \sum_{t=2}^T{\log{\frac{q(\mathbf{x}_{t-1} | \mathbf{x}_t, \mathbf{x}_0)}{p_\theta(\mathbf{x}_{t-1} | \mathbf{x}_t)}}} + \log{\frac{q(\mathbf{x}_T | \mathbf{x}_0)}{q(\mathbf{x}_1 | \mathbf{x}_0)}} + \log{\frac{q(\mathbf{x}_1 | \mathbf{x}_0)}{p_\theta(\mathbf{x}_0 | \mathbf{x}_1)}}\Big] \quad \text{(telescoping sum)} \\ & = \mathbb{E}_q\Big[\log{\frac{q(\mathbf{x}_T | \mathbf{x}_0)}{p(\mathbf{x}_T)}} + \sum_{t=2}^T{\log{\frac{q(\mathbf{x}_{t-1} | \mathbf{x}_t, \mathbf{x}_0)}{p_\theta(\mathbf{x}_{t-1} | \mathbf{x}_t)}}} - \log{p_\theta(\mathbf{x}_0 | \mathbf{x}_1)}\Big] \\ & = \underbrace{D_{KL}(q(\mathbf{x}_T | \mathbf{x}_0) \,||\, p(\mathbf{x}_T))}_{L_T} + \sum_{t=2}^T{\underbrace{\mathbb{E}_q\big[D_{KL}(q(\mathbf{x}_{t-1} | \mathbf{x}_t, \mathbf{x}_0) \,||\, p_\theta(\mathbf{x}_{t-1} | \mathbf{x}_t))\big]}_{L_{t-1}}} + \underbrace{\mathbb{E}_q\big[- \log{p_\theta(\mathbf{x}_0 | \mathbf{x}_1)}\big]}_{L_0} \\ \end{aligned}

where the Kullback–Leibler divergence

D_{KL}(p \,||\, q) = \int_x{p(x) \log{\frac{p(x)}{q(x)}}\, dx} \geq 0

measures how much distribution p diverges from distribution q (it equals zero if and only if the two distributions coincide)

now L_T = D_{KL}(q(\mathbf{x}_T | \mathbf{x}_0) \,||\, p(\mathbf{x}_T)) compares the endpoint of the forward process with pure Gaussian noise; since q has no learnable parameters and \mathbf{x}_T is approximately Gaussian noise, this term is a constant and can be ignored during training

within L_{t-1}

q(\mathbf{x}_{t-1} | \mathbf{x}_t, \mathbf{x}_0) = \mathcal{N}(\mathbf{x}_{t-1}; \tilde{\boldsymbol{\mu}}_t(\mathbf{x}_t, \mathbf{x}_0), \tilde{\beta}_t I)

where

\begin{aligned} & q(\mathbf{x}_{t-1} \vert \mathbf{x}_t, \mathbf{x}_0) \\ = & q(\mathbf{x}_t \vert \mathbf{x}_{t-1}, \mathbf{x}_0) \frac{ q(\mathbf{x}_{t-1} \vert \mathbf{x}_0) }{ q(\mathbf{x}_t \vert \mathbf{x}_0) } \\ \propto & \exp \Big(-\frac{1}{2} \big(\frac{(\mathbf{x}_t - \sqrt{\alpha_t} \mathbf{x}_{t-1})^2}{\beta_t} + \frac{(\mathbf{x}_{t-1} - \sqrt{\tilde{\alpha}_{t-1}} \mathbf{x}_0)^2}{1-\tilde{\alpha}_{t-1}} - \frac{(\mathbf{x}_t - \sqrt{\tilde{\alpha}_t} \mathbf{x}_0)^2}{1-\tilde{\alpha}_t} \big) \Big) \\ = & \exp \Big(-\frac{1}{2} \big(\frac{\mathbf{x}_t^2 - 2\sqrt{\alpha_t} \mathbf{x}_t {\mathbf{x}_{t-1}} {+ \alpha_t} {\mathbf{x}_{t-1}^2} }{\beta_t} + \frac{ {\mathbf{x}_{t-1}^2} {- 2 \sqrt{\tilde{\alpha}_{t-1}} \mathbf{x}_0} {\mathbf{x}_{t-1}} {+ \tilde{\alpha}_{t-1} \mathbf{x}_0^2} }{1-\tilde{\alpha}_{t-1}} - \frac{(\mathbf{x}_t - \sqrt{\tilde{\alpha}_t} \mathbf{x}_0)^2}{1-\tilde{\alpha}_t} \big) \Big) \\ = & \exp\Big( -\frac{1}{2} \big( {(\frac{\alpha_t}{\beta_t} + \frac{1}{1 - \tilde{\alpha}_{t-1}})} \mathbf{x}_{t-1}^2 - {(\frac{2\sqrt{\alpha_t}}{\beta_t} \mathbf{x}_t + \frac{2\sqrt{\tilde{\alpha}_{t-1}}}{1 - \tilde{\alpha}_{t-1}} \mathbf{x}_0)} \mathbf{x}_{t-1} + C(\mathbf{x}_t, \mathbf{x}_0) \big) \Big) \end{aligned}

thus

\begin{aligned} \tilde{\beta}_t &= 1/(\frac{\alpha_t}{\beta_t} + \frac{1}{1 - \tilde{\alpha}_{t-1}}) = 1/(\frac{\alpha_t - \tilde{\alpha}_t + \beta_t}{\beta_t(1 - \tilde{\alpha}_{t-1})}) = {\frac{1 - \tilde{\alpha}_{t-1}}{1 - \tilde{\alpha}_t} \cdot \beta_t} \\ \tilde{\boldsymbol{\mu}}_t (\mathbf{x}_t, \mathbf{x}_0) &= (\frac{\sqrt{\alpha_t}}{\beta_t} \mathbf{x}_t + \frac{\sqrt{\tilde{\alpha}_{t-1} }}{1 - \tilde{\alpha}_{t-1}} \mathbf{x}_0)/(\frac{\alpha_t}{\beta_t} + \frac{1}{1 - \tilde{\alpha}_{t-1}}) \\ &= (\frac{\sqrt{\alpha_t}}{\beta_t} \mathbf{x}_t + \frac{\sqrt{\tilde{\alpha}_{t-1} }}{1 - \tilde{\alpha}_{t-1}} \mathbf{x}_0) {\frac{1 - \tilde{\alpha}_{t-1}}{1 - \tilde{\alpha}_t} \cdot \beta_t} \\ &= \frac{\sqrt{\alpha_t}(1 - \tilde{\alpha}_{t-1})}{1 - \tilde{\alpha}_t} \mathbf{x}_t + \frac{\sqrt{\tilde{\alpha}_{t-1}}\beta_t}{1 - \tilde{\alpha}_t} \mathbf{x}_0\\ &= \frac{\sqrt{\alpha_t}(1 - \tilde{\alpha}_{t-1})}{1 - \tilde{\alpha}_t} \mathbf{x}_t + \frac{\sqrt{\tilde{\alpha}_{t-1}}\beta_t}{1 - \tilde{\alpha}_t} \frac{1}{\sqrt{\tilde{\alpha}_t}}(\mathbf{x}_t - \sqrt{1 - \tilde{\alpha}_t}\boldsymbol{\epsilon}) \\ &= \frac{1}{\sqrt{\alpha_t}} ( \mathbf{x}_t - \frac{1 - \alpha_t}{\sqrt{1 - \tilde{\alpha}_t}} \boldsymbol{\epsilon} ) \end{aligned}

also within L_{t-1}

p_\theta(\mathbf{x}_{t-1} | \mathbf{x}_t) = \mathcal{N}(\mathbf{x}_{t-1}; \boldsymbol{\mu}_\theta(\mathbf{x}_t, t), \boldsymbol{\Sigma}_\theta(\mathbf{x}_t, t))

now define the loss as the MSE between \tilde{\boldsymbol{\mu}}_t(\mathbf{x}_t, \mathbf{x}_0) and \boldsymbol{\mu}_\theta(\mathbf{x}_t, t)

L_t = \frac{1}{2 \sigma_t^2}||\tilde{\boldsymbol{\mu}_t}(\mathbf{x}_t, \mathbf{x}_0) - \boldsymbol{\mu}_\theta(\mathbf{x}_t, t)||^2

now that we need to train \boldsymbol{\mu}_\theta to predict \tilde{\boldsymbol{\mu}_t} = \frac{1}{\sqrt{\alpha_t}} ( \mathbf{x}_t - \frac{1 - \alpha_t}{\sqrt{1 - \tilde{\alpha}_t}} \boldsymbol{\epsilon} ), we can reparameterize the Gaussian noise term instead to make it predict \boldsymbol{\epsilon}_t from the input \mathbf{x}_t at time step t

\boldsymbol{\mu}_\theta(\mathbf{x}_t, t) = \frac{1}{\sqrt{\alpha_t}} ( \mathbf{x}_t - \frac{1 - \alpha_t}{\sqrt{1 - \tilde{\alpha}_t}} \boldsymbol{\epsilon}_\theta(\mathbf{x}_t, t) )

now we can rewrite

\begin{aligned} L_t & = \frac{1}{2 \sigma_t^2}||\tilde{\boldsymbol{\mu}}_t(\mathbf{x}_t, \mathbf{x}_0) - \boldsymbol{\mu}_\theta(\mathbf{x}_t, t)||^2 \\ & = \frac{1}{2 \sigma_t^2}||\frac{1}{\sqrt{\alpha_t}} ( \mathbf{x}_t - \frac{1 - \alpha_t}{\sqrt{1 - \tilde{\alpha}_t}} \boldsymbol{\epsilon} ) - \frac{1}{\sqrt{\alpha_t}} ( \mathbf{x}_t - \frac{1 - \alpha_t}{\sqrt{1 - \tilde{\alpha}_t}} \boldsymbol{\epsilon}_\theta(\mathbf{x}_t, t) )||^2 \\ & = \frac{\beta_t^2}{2\sigma_t^2 \alpha_t (1-\tilde{\alpha}_t)} ||\boldsymbol{\epsilon} - \boldsymbol{\epsilon}_\theta(\mathbf{x}_t, t)||^2 \end{aligned}

finally

\begin{aligned} p_\theta(\mathbf{x}_{t-1} | \mathbf{x}_t) &= \mathcal{N}(\mathbf{x}_{t-1}; \boldsymbol{\mu}_\theta(\mathbf{x}_t, t), \boldsymbol{\Sigma}_\theta(\mathbf{x}_t, t)) \\ &= \mathcal{N}(\mathbf{x}_{t-1}; \frac{1}{\sqrt{\alpha_t}} ( \mathbf{x}_t - \frac{1 - \alpha_t}{\sqrt{1 - \tilde{\alpha}_t}} \boldsymbol{\epsilon}_\theta(\mathbf{x}_t, t) ), \beta_t I) \\ \end{aligned}

leading to

\mathbf{x}_{t-1} = \frac{1}{\sqrt{\alpha_t}} ( \mathbf{x}_t - \frac{1 - \alpha_t}{\sqrt{1 - \tilde{\alpha}_t}} \boldsymbol{\epsilon}_\theta(\mathbf{x}_t, t) ) + \sqrt{\beta_t} \boldsymbol{\epsilon}, \quad \boldsymbol{\epsilon} \sim \mathcal{N}(\mathbf{0}, I)

L_0 = -\log{p_\theta(\mathbf{x}_0 | \mathbf{x}_1)} can be minimized if we have a close enough prediction of \mathbf{x}_0 from \mathbf{x}_1.

Reference


  1. L. Weng, “What are diffusion models?” lilianweng.github.io, Jul 2021. [Online]. Available: https://lilianweng.github.io/posts/2021-07-11-diffusion-models/